Example #1
0
def test_cosine_hungarian_with_tolerance_2_0():
    """Check CosineHungarian(tolerance=2.0) against a hand-derived cosine score.

    Two small dummy spectra are paired, and both the returned score and the
    returned number of matches are compared with values computed directly
    from the peak intensities.
    """
    mz_1 = numpy.array([100, 299, 300, 301, 510], dtype="float")
    intensities_1 = numpy.array([0.1, 1.0, 0.2, 0.3, 0.4], dtype="float")
    spectrum_1 = Spectrum(mz=mz_1, intensities=intensities_1)

    mz_2 = numpy.array([100, 300, 301, 511], dtype="float")
    intensities_2 = numpy.array([0.1, 1.0, 0.3, 0.4], dtype="float")
    spectrum_2 = Spectrum(mz=mz_2, intensities=intensities_2)

    score, n_matches = CosineHungarian(tolerance=2.0).pair(spectrum_1,
                                                           spectrum_2)

    # Peak positions expected to pair up (mz values within the tolerance):
    # entry k of idx_1 (spectrum_1) is paired with entry k of idx_2 (spectrum_2).
    idx_1 = [0, 1, 3, 4]
    idx_2 = [0, 1, 2, 3]

    # Cosine score: sum of products of paired intensities divided by the
    # product of the two intensity-vector norms.
    paired_products = spectrum_1.peaks.intensities[idx_1] \
        * spectrum_2.peaks.intensities[idx_2]
    norm_1 = numpy.sqrt((spectrum_1.peaks.intensities ** 2).sum())
    norm_2 = numpy.sqrt((spectrum_2.peaks.intensities ** 2).sum())
    expected_score = paired_products.sum() / (norm_1 * norm_2)

    assert score == pytest.approx(
        expected_score, 0.0001), "Expected different cosine score."
    assert n_matches == len(idx_1), \
        "Expected different number of matching peaks."
Example #2
0
def test_cosine_hungarian_without_parameters():
    """Check a default-constructed CosineHungarian against a hand-derived score.

    The result of ``pair`` is read through its ``"score"`` and ``"matches"``
    entries and compared with values computed directly from the peaks.
    """
    shared_intensities = [0.1, 0.2, 1.0, 0.3, 0.4]

    spectrum_1 = Spectrum(
        mz=numpy.array([100, 200, 300, 500, 510], dtype="float"),
        intensities=numpy.array(shared_intensities, dtype="float"))
    spectrum_2 = Spectrum(
        mz=numpy.array([100, 200, 290, 490, 510], dtype="float"),
        intensities=numpy.array(shared_intensities, dtype="float"))

    similarity = CosineHungarian()
    result = similarity.pair(spectrum_1, spectrum_2)

    # Positions (identical in both spectra) of the peaks whose mz values match
    # within the tolerance in use.
    matching_idx = [0, 1, 4]

    weights_1 = spectrum_1.peaks.intensities
    weights_2 = spectrum_2.peaks.intensities
    numerator = (weights_1[matching_idx] * weights_2[matching_idx]).sum()
    denominator = numpy.sqrt((weights_1 ** 2).sum()) \
        * numpy.sqrt((weights_2 ** 2).sum())
    expected_score = numerator / denominator

    assert result["score"] == pytest.approx(
        expected_score, 0.0001), "Expected different cosine score."
    assert result["matches"] == len(
        matching_idx), "Expected different number of matching peaks."
Example #3
0
def test_cosine_hungarian_case_without_matches():
    """Pair two spectra that share no matching peaks and expect score 0 with 0 matches."""
    intensities = [1.0, 0.1]

    # The mz values of the two spectra are 10 apart, peak for peak.
    spectrum_1 = Spectrum(mz=numpy.array([100, 200], dtype="float"),
                          intensities=numpy.array(intensities, dtype="float"),
                          metadata={})
    spectrum_2 = Spectrum(mz=numpy.array([110, 210], dtype="float"),
                          intensities=numpy.array(intensities, dtype="float"),
                          metadata={})

    score, n_matches = CosineHungarian().pair(spectrum_1, spectrum_2)

    assert score == 0.0, "Expected different cosine score."
    assert n_matches == 0, "Expected different number of matching peaks."
Example #4
0
def test_cosine_hungarian_case_where_greedy_would_fail():
    """Pair two closely spaced two-peak spectra at tolerance 0.01.

    Intended as a case that a greedy cosine implementation would get wrong;
    both peaks are expected to be matched.
    """
    first = Spectrum(
        mz=numpy.array([100.005, 100.016], dtype="float"),
        intensities=numpy.array([1.0, 0.9], dtype="float"),
        metadata={})
    second = Spectrum(
        mz=numpy.array([100.005, 100.01], dtype="float"),
        intensities=numpy.array([0.9, 1.0], dtype="float"),
        metadata={})

    similarity = CosineHungarian(tolerance=0.01)
    score, n_matches = similarity.pair(first, second)

    # Reference value for the score when both peaks are paired.
    expected_score = 0.994475

    assert score == pytest.approx(
        expected_score, 0.0001), "Expected different cosine score."
    assert n_matches == 2, "Expected different number of matching peaks."
def library_match(spectra_list,
                  lib_mgf,
                  precursor_tol=1.0,
                  cosine=0.7,
                  n_peaks=3):
    """Score the given spectra against a library read from an mgf file.

    The library spectra are loaded from ``lib_mgf``, intensity-normalized and
    compared to ``spectra_list`` with ``CosineHungarian`` using its default
    settings. Every score entry is printed, and the collected entries are
    sorted in descending order of their element at index 2.

    NOTE(review): as visible here, ``precursor_tol``, ``cosine`` and
    ``n_peaks`` are never read and nothing is returned; the sorted list is
    local only. The function may be truncated - confirm against the
    complete source.
    """
    # Only intensity normalization is applied to the library spectra
    # (peak intensities scaled to a maximum of 1); default_filters is not used.
    library_spectra = [
        normalize_intensities(entry) for entry in load_from_mgf(lib_mgf)
    ]

    scores = calculate_scores(references=library_spectra,
                              queries=spectra_list,
                              similarity_function=CosineHungarian())

    ranked = []
    for entry in scores:
        print(entry)
        ranked.append(entry)

    # Highest value at index 2 first.
    ranked.sort(key=lambda entry: entry[2], reverse=True)
Example #6
0
def test_cosine_hungarian_matrix_without_parameters():
    """Check the entries of CosineHungarian().matrix for two dummy spectra.

    Entries comparing a spectrum with itself are expected to equal
    ``(1.0, 5)``; the two cross entries are expected to agree with each other
    and to be approximately ``(0.1615384, 3)``.
    """
    shared_intensities = [0.1, 0.2, 1.0, 0.3, 0.4]

    spectrum_1 = Spectrum(
        mz=numpy.array([100, 200, 300, 500, 510], dtype="float"),
        intensities=numpy.array(shared_intensities, dtype="float"))
    spectrum_2 = Spectrum(
        mz=numpy.array([100, 200, 290, 490, 510], dtype="float"),
        intensities=numpy.array(shared_intensities, dtype="float"))

    references = [spectrum_1, spectrum_2]
    queries = [spectrum_1, spectrum_2]
    scores = CosineHungarian().matrix(references, queries)

    expected_self = (1.0, 5)
    expected_cross = pytest.approx((0.1615384, 3), 1e-6)

    assert scores[0, 0] == scores[1, 1] == expected_self, \
        "Expected different cosine score."
    assert scores[0, 1] == scores[1, 0] == expected_cross, \
        "Expected different cosine score."
Example #7
0
def test_cosine_hungarian_order_of_arguments():
    """Check that pair(A, B) and pair(B, A) give the same score and match count."""
    spectrum_1 = Spectrum(
        mz=numpy.array([100, 200, 299, 300, 301, 500, 510], dtype="float"),
        intensities=numpy.array([0.02, 0.02, 1.0, 0.2, 0.4, 0.04, 0.2],
                                dtype="float"),
        metadata={})
    spectrum_2 = Spectrum(
        mz=numpy.array([100, 200, 300, 301, 500, 512], dtype="float"),
        intensities=numpy.array([0.02, 0.02, 1.0, 0.2, 0.04, 0.2],
                                dtype="float"),
        metadata={})

    similarity = CosineHungarian(tolerance=2.0)
    forward = similarity.pair(spectrum_1, spectrum_2)
    backward = similarity.pair(spectrum_2, spectrum_1)

    # Each result unpacks into (score, number of matches).
    score_1_2, n_matches_1_2 = forward
    score_2_1, n_matches_2_1 = backward

    assert score_1_2 == score_2_1, \
        "Expected that the order of the arguments would not matter."
    assert n_matches_1_2 == n_matches_2_1, \
        "Expected that the order of the arguments would not matter."
Example #8
0
def test_cosine_hungarian_with_peak_powers():
    """Check CosineHungarian with mz and intensity powers against a hand-derived score.

    Each peak is weighted as ``mz ** mz_power * intensity ** intensity_power``
    in the expected-value computation below.
    """
    mz_power = 0.5
    intensity_power = 2.0
    shared_intensities = [0.1, 0.2, 1.0, 0.3, 0.4]

    spectrum_1 = Spectrum(
        mz=numpy.array([100, 200, 300, 500, 510], dtype="float"),
        intensities=numpy.array(shared_intensities, dtype="float"))
    spectrum_2 = Spectrum(
        mz=numpy.array([100, 200, 290, 490, 510], dtype="float"),
        intensities=numpy.array(shared_intensities, dtype="float"))

    similarity = CosineHungarian(tolerance=1.0,
                                 mz_power=mz_power,
                                 intensity_power=intensity_power)
    score, n_matches = similarity.pair(spectrum_1, spectrum_2)

    # Positions (identical in both spectra) of the peaks whose mz values match
    # within the given tolerance.
    matches = [0, 1, 4]

    mz1, intensity1 = spectrum_1.peaks.mz, spectrum_1.peaks.intensities
    mz2, intensity2 = spectrum_2.peaks.mz, spectrum_2.peaks.intensities

    # Numerator: products of the weighted matching peaks.
    matched_products = (mz1[matches] ** mz_power) \
        * (intensity1[matches] ** intensity_power) \
        * (mz2[matches] ** mz_power) \
        * (intensity2[matches] ** intensity_power)

    # Denominator: product of the norms of the fully weighted spectra.
    weighted_1 = (mz1 ** mz_power) * (intensity1 ** intensity_power)
    weighted_2 = (mz2 ** mz_power) * (intensity2 ** intensity_power)
    denominator = numpy.sqrt((weighted_1 ** 2).sum()) \
        * numpy.sqrt((weighted_2 ** 2).sum())

    expected_score = matched_products.sum() / denominator

    assert score == pytest.approx(
        expected_score, 0.0001), "Expected different cosine score."
    assert n_matches == len(
        matches), "Expected different number of matching peaks."
Example #9
0
def main(argv):
    """Command-line entry point: score query spectra against reference spectra.

    Query spectra (and, unless ``-s`` is given, reference spectra) are loaded
    from msp or mgf files, the requested similarity metric is built from the
    tolerance and power arguments, scores are computed with
    ``calculate_scores`` and handed to ``write_outputs``.

    NOTE(review): ``argv`` is accepted but not read; ``parse_args()`` is
    called without arguments, so argparse takes its input from ``sys.argv``.

    Returns:
        0 on success, -1 if the similarity metric name is not recognized.

    Raises:
        ValueError: if a query or reference file format is neither 'msp'
            nor 'mgf'.
    """
    parser = argparse.ArgumentParser(
        description="Compute MSP similarity scores")

    # Optional arguments.
    parser.add_argument("-s", dest="symmetric", action='store_true',
                        help="Computation is symmetric.")
    parser.add_argument("--ref", dest="references_filename", type=str,
                        help="Path to reference spectra library.")
    parser.add_argument("--ref_format", dest="references_format", type=str,
                        help="Reference spectra library file format.")

    # Positional arguments, in the order they must appear on the command line.
    positional_arguments = (
        ("queries_filename", str, "Path to query spectra."),
        ("queries_format", str, "Query spectra file format."),
        ("similarity_metric", str, 'Metric to use for matching.'),
        ("tolerance", float, "Tolerance to use for peak matching."),
        ("mz_power", float,
         "The power to raise mz to in the cosine function."),
        ("intensity_power", float,
         "The power to raise intensity to in the cosine function."),
        ("output_filename_scores", str,
         "Path where to store the output .tsv scores."),
        ("output_filename_matches", str,
         "Path where to store the output .tsv matches."),
    )
    for arg_name, arg_type, arg_help in positional_arguments:
        parser.add_argument(arg_name, type=arg_type, help=arg_help)

    args = parser.parse_args()

    loaders = {'msp': load_from_msp, 'mgf': load_from_mgf}

    load_queries = loaders.get(args.queries_format)
    if load_queries is None:
        raise ValueError(
            f'File format {args.queries_format} not supported for query spectra.'
        )
    queries_spectra = list(load_queries(args.queries_filename))

    # In symmetric mode the queries double as references further below.
    reference_spectra = []
    if not args.symmetric:
        load_references = loaders.get(args.references_format)
        if load_references is None:
            raise ValueError(
                f'File format {args.references_format} not supported for reference spectra library.'
            )
        reference_spectra = list(load_references(args.references_filename))

    metric_classes = {
        'CosineGreedy': CosineGreedy,
        'CosineHungarian': CosineHungarian,
        'ModifiedCosine': ModifiedCosine,
    }
    metric_class = metric_classes.get(args.similarity_metric)
    if metric_class is None:
        return -1

    similarity_metric = metric_class(args.tolerance, args.mz_power,
                                     args.intensity_power)
    if metric_class is ModifiedCosine:
        # Only the ModifiedCosine path runs convert_precursor_mz on the spectra.
        reference_spectra = [
            convert_precursor_mz(spectrum) for spectrum in reference_spectra
        ]
        queries_spectra = [
            convert_precursor_mz(spectrum) for spectrum in queries_spectra
        ]

    print("Calculating scores...")
    references = queries_spectra if args.symmetric else reference_spectra
    scores = calculate_scores(references=references,
                              queries=queries_spectra,
                              similarity_function=similarity_metric,
                              is_symmetric=args.symmetric)

    write_outputs(args, scores)
    return 0
Example #10
0
def main(argv):
    """Command-line entry point: score MSP query spectra against a reference library.

    Spectra are loaded with ``load_from_msp``, optionally passed through
    ``default_filters`` (``-f``) and ``normalize_intensities`` (``-n``), then
    scored with the requested similarity metric and handed to
    ``write_outputs``. With ``-s`` the queries are compared against themselves
    and no reference file is read.

    NOTE(review): ``argv`` is accepted but not read; ``parse_args()`` is
    called without arguments, so argparse takes its input from ``sys.argv``.

    Returns:
        0 on success, -1 if the similarity metric name is not recognized.
    """
    parser = argparse.ArgumentParser(
        description="Compute MSP similarity scores")

    # Optional flags and options.
    parser.add_argument("-f", dest="default_filters", action='store_true',
                        help="Apply default filters")
    parser.add_argument("-n", dest="normalize_intensities",
                        action='store_true', help="Normalize intensities.")
    parser.add_argument("-s", dest="symmetric", action='store_true',
                        help="Computation is symmetric.")
    parser.add_argument("--ref", dest="references_filename", type=str,
                        help="Path to reference MSP library.")

    # Positional arguments, in the order they must appear on the command line.
    positional_arguments = (
        ("queries_filename", str, "Path to query spectra."),
        ("similarity_metric", str, 'Metric to use for matching.'),
        ("tolerance", float, "Tolerance to use for peak matching."),
        ("mz_power", float,
         "The power to raise mz to in the cosine function."),
        ("intensity_power", float,
         "The power to raise intensity to in the cosine function."),
        ("output_filename_scores", str,
         "Path where to store the output .csv scores."),
        ("output_filename_matches", str,
         "Path where to store the output .csv matches."),
    )
    for arg_name, arg_type, arg_help in positional_arguments:
        parser.add_argument(arg_name, type=arg_type, help=arg_help)

    args = parser.parse_args()

    queries_spectra = list(load_from_msp(args.queries_filename))
    # In symmetric mode the queries double as references further below.
    reference_spectra = []
    if not args.symmetric:
        reference_spectra = list(load_from_msp(args.references_filename))

    if args.default_filters:
        print("Applying default filters...")
        queries_spectra = [default_filters(s) for s in queries_spectra]
        reference_spectra = [default_filters(s) for s in reference_spectra]

    if args.normalize_intensities:
        print("Normalizing intensities...")
        queries_spectra = [normalize_intensities(s) for s in queries_spectra]
        reference_spectra = [
            normalize_intensities(s) for s in reference_spectra
        ]

    metric_classes = {
        'CosineGreedy': CosineGreedy,
        'CosineHungarian': CosineHungarian,
        'ModifiedCosine': ModifiedCosine,
    }
    metric_class = metric_classes.get(args.similarity_metric)
    if metric_class is None:
        return -1

    similarity_metric = metric_class(args.tolerance, args.mz_power,
                                     args.intensity_power)
    if metric_class is ModifiedCosine:
        # Only the ModifiedCosine path runs add_precursor_mz on the spectra.
        reference_spectra = [add_precursor_mz(s) for s in reference_spectra]
        queries_spectra = [add_precursor_mz(s) for s in queries_spectra]

    print("Calculating scores...")
    references = queries_spectra if args.symmetric else reference_spectra
    scores = calculate_scores(references=references,
                              queries=queries_spectra,
                              similarity_function=similarity_metric,
                              is_symmetric=args.symmetric)

    write_outputs(args, scores)
    return 0
Example #11
0
def main(argv):
    """Command-line entry point: score MSP spectra and write two CSV tables.

    Reference and query spectra are read with ``load_from_msp``, scored with
    the requested similarity metric, and the ``"score"`` and ``"matches"``
    fields of the result are written as ';'-separated tables whose rows are
    labelled with reference names and whose columns are labelled with query
    names (taken from each spectrum's ``metadata['name']``).

    NOTE(review): ``argv`` is accepted but not read; ``parse_args()`` is
    called without arguments, so argparse takes its input from ``sys.argv``.

    Returns:
        0 on success, -1 if the similarity metric name is not recognized.
    """
    parser = argparse.ArgumentParser(
        description="Compute MSP similarity scores")

    # All arguments are positional; the order below is the command-line order.
    positional_arguments = (
        ("references_filename", str, "Path to reference MSP library."),
        ("queries_filename", str, "Path to query spectra."),
        ("similarity_metric", str, 'Metric to use for matching.'),
        ("output_filename_scores", str,
         "Path where to store the output .csv scores."),
        ("output_filename_matches", str,
         "Path where to store the output .csv matches."),
        ("tolerance", float, "Tolerance to use for peak matching."),
        ("mz_power", float,
         "The power to raise mz to in the cosine function."),
        ("intensity_power", float,
         "The power to raise intensity to in the cosine function."),
    )
    for arg_name, arg_type, arg_help in positional_arguments:
        parser.add_argument(arg_name, type=arg_type, help=arg_help)

    args = parser.parse_args()

    # The loader results are only turned into lists right before scoring.
    reference_spectra = load_from_msp(args.references_filename)
    queries_spectra = load_from_msp(args.queries_filename)

    metric_classes = {
        'CosineGreedy': CosineGreedy,
        'CosineHungarian': CosineHungarian,
        'ModifiedCosine': ModifiedCosine,
    }
    metric_class = metric_classes.get(args.similarity_metric)
    if metric_class is None:
        return -1

    similarity_metric = metric_class(args.tolerance, args.mz_power,
                                     args.intensity_power)
    if metric_class is ModifiedCosine:
        # Only the ModifiedCosine path runs add_precursor_mz on the spectra;
        # generator expressions keep the application lazy.
        reference_spectra = (add_precursor_mz(s) for s in reference_spectra)
        queries_spectra = (add_precursor_mz(s) for s in queries_spectra)

    scores = calculate_scores(
        references=list(reference_spectra),
        queries=list(queries_spectra),
        similarity_function=similarity_metric,
    )

    query_names = [spectra.metadata['name'] for spectra in scores.queries]
    reference_names = [
        spectra.metadata['name'] for spectra in scores.references
    ]

    # One table per result field: scores first, then number of matches.
    outputs = (
        ("score", args.output_filename_scores),
        ("matches", args.output_filename_matches),
    )
    for field, destination in outputs:
        table = DataFrame(
            data=[entry[field] for entry in scores.scores],
            index=reference_names,
            columns=query_names)
        table.to_csv(destination, sep=';')

    return 0