Example no. 1
def test_cosine_greedy_with_peak_powers():
    """Compare output cosine score with own calculation on simple dummy spectrums.
    Here testing the option of raising m/z and intensity values to given powers.
    """
    mz_power = 0.5
    intensity_power = 2.0
    spectrum_1 = Spectrum(mz=numpy.array([100, 200, 300, 500, 510], dtype="float"),
                          intensities=numpy.array([0.1, 0.2, 1.0, 0.3, 0.4], dtype="float"))

    spectrum_2 = Spectrum(mz=numpy.array([100, 200, 290, 490, 510], dtype="float"),
                          intensities=numpy.array([0.1, 0.2, 1.0, 0.3, 0.4], dtype="float"))
    cosine_greedy = CosineGreedy(tolerance=1.0, mz_power=mz_power, intensity_power=intensity_power)
    score = cosine_greedy.pair(spectrum_1, spectrum_2)

    # Derive expected cosine score
    matches = [0, 1, 4]  # Those peaks have matching mz values (within given tolerance)
    intensity1 = spectrum_1.peaks.intensities
    mz1 = spectrum_1.peaks.mz
    intensity2 = spectrum_2.peaks.intensities
    mz2 = spectrum_2.peaks.mz
    multiply_matching_intensities = (mz1[matches] ** mz_power) * (intensity1[matches] ** intensity_power) \
        * (mz2[matches] ** mz_power) * (intensity2[matches] ** intensity_power)
    denominator = numpy.sqrt((((mz1 ** mz_power) * (intensity1 ** intensity_power)) ** 2).sum()) \
        * numpy.sqrt((((mz2 ** mz_power) * (intensity2 ** intensity_power)) ** 2).sum())
    expected_score = multiply_matching_intensities.sum() / denominator

    assert score["score"] == pytest.approx(expected_score, 0.0001), "Expected different cosine score."
    assert score["matches"] == len(matches), "Expected different number of matching peaks."
Example no. 2
def test_cosine_score_greedy_with_tolerance_2_0():
    """Compare output cosine score for tolerance 2.0 with own calculation on simple dummy spectrums."""
    spectrum_1 = Spectrum(mz=numpy.array([100, 299, 300, 301, 510],
                                         dtype="float"),
                          intensities=numpy.array([0.1, 1.0, 0.2, 0.3, 0.4],
                                                  dtype="float"))

    spectrum_2 = Spectrum(mz=numpy.array([100, 300, 301, 511], dtype="float"),
                          intensities=numpy.array([0.1, 1.0, 0.3, 0.4],
                                                  dtype="float"))
    cosine_greedy = CosineGreedy(tolerance=2.0)
    score, n_matches = cosine_greedy.pair(spectrum_1, spectrum_2)

    # Derive expected cosine score
    expected_matches = [[0, 1, 3, 4], [0, 1, 2, 3]]  # Those peaks have matching mz values (within given tolerance)
    multiply_matching_intensities = spectrum_1.peaks.intensities[expected_matches[0]] \
        * spectrum_2.peaks.intensities[expected_matches[1]]
    denominator = numpy.sqrt((spectrum_1.peaks.intensities ** 2).sum()) \
        * numpy.sqrt((spectrum_2.peaks.intensities ** 2).sum())
    expected_score = multiply_matching_intensities.sum() / denominator

    assert score == pytest.approx(expected_score,
                                  0.0001), "Expected different cosine score."
    assert n_matches == len(
        expected_matches[0]), "Expected different number of matching peaks."
Example no. 3
def test_cosine_greedy_with_arrays_symmetric():
    """Test if matrix with is_symmetric=True works properly."""
    spectrum_1 = Spectrum(mz=numpy.array([100, 200, 300], dtype="float"),
                          intensities=numpy.array([0.1, 0.2, 1.0], dtype="float"))

    spectrum_2 = Spectrum(mz=numpy.array([110, 190, 290], dtype="float"),
                          intensities=numpy.array([0.5, 0.2, 1.0], dtype="float"))
    spectrums = [spectrum_1, spectrum_2]
    cosine_greedy = CosineGreedy()
    scores = cosine_greedy.matrix(spectrums, spectrums, is_symmetric=True)

    assert scores[0][0][0] == pytest.approx(scores[1][1][0], 0.000001), "Expected different cosine score."
    assert scores[0][1][0] == pytest.approx(scores[1][0][0], 0.000001), "Expected different cosine score."
Example no. 4
def get_hits(query_spec,
             library_spec,
             precursor_tol=1,
             metaKey='parent_mass',
             cosine_tol=0.1,
             decoys=False,
             passatutto=False,
             min_match_count=6):
    cosine = CosineGreedy(tolerance=cosine_tol)
    library_spec.sort(key=lambda x: getMeta(x)[metaKey])

    hits = []
    library_prec_list = [getMeta(x)[metaKey] for x in library_spec]
    for q_idx, q in enumerate(query_spec):
        if metaKey not in getMeta(q):
            continue
        min_mz = getMeta(q)[metaKey] - precursor_tol
        max_mz = getMeta(q)[metaKey] + precursor_tol
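        # Binary-search the sorted precursor list for the lower bound of the
        # [min_mz, max_mz) window, then scan forward to find the upper bound.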
        pos = bisect.bisect_right(library_prec_list, min_mz)
        pos2 = pos
        while pos2 < len(
                library_prec_list) and library_prec_list[pos2] < max_mz:
            pos2 += 1
        # nothing in precursor range
        if pos == pos2:
            continue
        scores = []
        for l_idx in range(pos, pos2):
            l = library_spec[l_idx]
            score, match_count = cosine.pair(q, l).item()
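            # A NaN score never equals itself, so `score != score` filters out failed comparisons.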
            if score != score:
                print('got nan for', q.get('compound_name'),
                      l.get('compound_name'))
                continue
            if match_count >= min_match_count:
                scores.append((score, l))
        scores.sort(key=lambda x: x[0], reverse=True)
        if scores:
            score, target = scores[0]
            if decoys:
                hits.append(Hit(q, target, score, 'decoy'))
            else:
                if passatutto:
                    hits.append(
                        Hit(q, target, score,
                            passatutto_inchis_equal(q, target)))
                else:
                    hits.append(Hit(q, target, score, inchis_equal(q, target)))
    return hits
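The precursor window in get_hits is located with one bisect followed by a linear scan; both bounds can also be found by bisection alone. A small sketch under the same assumption that the library precursor list is sorted ascending (the helper name precursor_window is illustrative, not part of the original code):

import bisect

def precursor_window(prec_list, center, tol):
    # Indices of entries strictly above center - tol and strictly below center + tol,
    # mirroring the bounds used in get_hits above.
    lo = bisect.bisect_right(prec_list, center - tol)
    hi = bisect.bisect_left(prec_list, center + tol, lo)
    return lo, hi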
Example no. 5
def test_cosine_score_greedy_order_of_arguments():
    """Compare cosine scores for A,B versus B,A, which should give the same score."""
    spectrum_1 = Spectrum(mz=numpy.array([100, 200, 299, 300, 301, 500, 510], dtype="float"),
                          intensities=numpy.array([0.02, 0.02, 1.0, 0.2, 0.4, 0.04, 0.2], dtype="float"),
                          metadata=dict())

    spectrum_2 = Spectrum(mz=numpy.array([100, 200, 300, 301, 500, 512], dtype="float"),
                          intensities=numpy.array([0.02, 0.02, 1.0, 0.2, 0.04, 0.2], dtype="float"),
                          metadata=dict())

    cosine_greedy = CosineGreedy(tolerance=2.0)
    score_1_2 = cosine_greedy.pair(spectrum_1, spectrum_2)
    score_2_1 = cosine_greedy.pair(spectrum_2, spectrum_1)

    assert score_1_2["score"] == score_2_1["score"], "Expected that the order of the arguments would not matter."
    assert score_1_2 == score_2_1, "Expected that the order of the arguments would not matter."
Example no. 6
def test_scores_by_reference_sorted():
    "Test scores_by_reference method with sort=True."
    spectrum_1 = Spectrum(mz=numpy.array([100, 150, 200.]),
                          intensities=numpy.array([0.7, 0.2, 0.1]),
                          metadata={'id': 'spectrum1'})
    spectrum_2 = Spectrum(mz=numpy.array([100, 140, 190.]),
                          intensities=numpy.array([0.4, 0.2, 0.1]),
                          metadata={'id': 'spectrum2'})
    spectrum_3 = Spectrum(mz=numpy.array([110, 140, 195.]),
                          intensities=numpy.array([0.6, 0.2, 0.1]),
                          metadata={'id': 'spectrum3'})
    spectrum_4 = Spectrum(mz=numpy.array([100, 150, 200.]),
                          intensities=numpy.array([0.6, 0.1, 0.6]),
                          metadata={'id': 'spectrum4'})
    references = [spectrum_1, spectrum_2, spectrum_3]
    queries = [spectrum_3, spectrum_4, spectrum_2]

    scores = calculate_scores(references, queries, CosineGreedy())
    selected_scores = scores.scores_by_reference(spectrum_2, sort=True)

    expected_result = [(scores.queries[i], scores.scores[1, i])
                       for i in [2, 1, 0]]
    assert selected_scores == expected_result, "Expected different scores."
    scores_only = numpy.array([x[1]["score"] for x in selected_scores])
    scores_expected = numpy.array(
        [1.0, 0.6129713330865563, 0.1363196353181994])
    assert numpy.allclose(scores_only, scores_expected, atol=1e-8), \
        "Expected different sorted scores."
Example no. 7
def test_cosine_greedy_without_parameters():
    """Compare output cosine score with own calculation on simple dummy spectrums."""
    spectrum_1 = Spectrum(mz=numpy.array([100, 200, 300, 500, 510], dtype="float"),
                          intensities=numpy.array([0.1, 0.2, 1.0, 0.3, 0.4], dtype="float"))

    spectrum_2 = Spectrum(mz=numpy.array([100, 200, 290, 490, 510], dtype="float"),
                          intensities=numpy.array([0.1, 0.2, 1.0, 0.3, 0.4], dtype="float"))
    cosine_greedy = CosineGreedy()
    score = cosine_greedy.pair(spectrum_1, spectrum_2)

    # Derive expected cosine score
    expected_matches = [0, 1, 4]  # Those peaks have matching mz values (within given tolerance)
    multiply_matching_intensities = spectrum_1.peaks.intensities[expected_matches] \
        * spectrum_2.peaks.intensities[expected_matches]
    denominator = numpy.sqrt((spectrum_1.peaks.intensities ** 2).sum()) \
        * numpy.sqrt((spectrum_2.peaks.intensities ** 2).sum())
    expected_score = multiply_matching_intensities.sum() / denominator

    assert score["score"] == pytest.approx(expected_score, 0.0001), "Expected different cosine score."
    assert score["matches"] == len(expected_matches), "Expected different number of matching peaks."
Example no. 8
def test_user_workflow():

    def apply_my_filters(s):
        s = default_filters(s)
        s = add_parent_mass(s)
        s = normalize_intensities(s)
        s = select_by_relative_intensity(s, intensity_from=0.0, intensity_to=1.0)
        s = select_by_mz(s, mz_from=0, mz_to=1000)
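        # require_minimum_number_of_peaks returns None for spectra with fewer than
        # n_required peaks, which is why None entries are filtered out after loading below.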
        s = require_minimum_number_of_peaks(s, n_required=5)
        return s

    module_root = os.path.join(os.path.dirname(__file__), "..")
    spectrums_file = os.path.join(module_root, "tests", "pesticides.mgf")

    # apply my filters to the data
    spectrums = [apply_my_filters(s) for s in load_from_mgf(spectrums_file)]

    # omit spectrums that didn't qualify for analysis
    spectrums = [s for s in spectrums if s is not None]

    # this will be a library grouping analysis, so queries = references = spectrums
    queries = spectrums[:]
    references = spectrums[:]

    # define similarity function
    cosine_greedy = CosineGreedy(tolerance=0.3)

    # calculate_scores
    scores = list(calculate_scores(references,
                                   queries,
                                   cosine_greedy))

    # filter out self-comparisons, require at least 20 matching peaks:
    filtered = [(reference, query, score, n_matching) for (reference, query, score, n_matching) in scores
                if reference != query and n_matching >= 20]

    sorted_by_score = sorted(filtered, key=lambda elem: elem[2], reverse=True)

    actual_top10 = sorted_by_score[:10]

    expected_top10 = [
        (references[48], queries[50], pytest.approx(0.9994783627790965, rel=1e-9), 25),
        (references[50], queries[48], pytest.approx(0.9994783627790965, rel=1e-9), 25),
        (references[46], queries[48], pytest.approx(0.9990141860269471, rel=1e-9), 27),
        (references[48], queries[46], pytest.approx(0.9990141860269471, rel=1e-9), 27),
        (references[46], queries[50], pytest.approx(0.9988793406908719, rel=1e-9), 22),
        (references[50], queries[46], pytest.approx(0.9988793406908719, rel=1e-9), 22),
        (references[57], queries[59], pytest.approx(0.9982171275552505, rel=1e-9), 46),
        (references[59], queries[57], pytest.approx(0.9982171275552505, rel=1e-9), 46),
        (references[73], queries[74], pytest.approx(0.9973823244169199, rel=1e-9), 23),
        (references[74], queries[73], pytest.approx(0.9973823244169199, rel=1e-9), 23),
    ]
    assert actual_top10 == expected_top10
Example no. 9
def test_scores_by_query():
    "Test scores_by_query method."
    spectrum_1, spectrum_2, spectrum_3, spectrum_4 = spectra()
    references = [spectrum_1, spectrum_2, spectrum_3]
    queries = [spectrum_2, spectrum_3, spectrum_4]

    scores = calculate_scores(references, queries, CosineGreedy())
    selected_scores = scores.scores_by_query(spectrum_4)

    expected_result = [(scores.references[i], scores.scores[i, 2])
                       for i in range(3)]
    assert selected_scores == expected_result, "Expected different scores."
Example no. 10
def test_cosine_greedy_pair(peaks, tolerance, mz_power, intensity_power,
                            expected_matches):
    builder = SpectrumBuilder()
    spectrum_1 = builder.with_mz(peaks[0][0]).with_intensities(
        peaks[0][1]).build()
    spectrum_2 = builder.with_mz(peaks[1][0]).with_intensities(
        peaks[1][1]).build()

    cosine_greedy = CosineGreedy(tolerance=tolerance,
                                 mz_power=mz_power,
                                 intensity_power=intensity_power)
    score = cosine_greedy.pair(spectrum_1, spectrum_2)

    expected_score = compute_expected_score(mz_power, intensity_power,
                                            spectrum_1, spectrum_2,
                                            expected_matches)

    assert score["score"] == pytest.approx(
        expected_score, 0.0001), "Expected different cosine score."
    assert score["matches"] == len(
        expected_matches[0]), "Expected different number of matching peaks."
Example no. 11
def test_cosine_greedy_matrix(symmetric):
    builder = SpectrumBuilder()
    spectrum_1 = builder.with_mz(numpy.array(
        [100, 200, 300], dtype="float")).with_intensities(
            numpy.array([0.1, 0.2, 1.0], dtype="float")).build()

    spectrum_2 = builder.with_mz(numpy.array(
        [110, 190, 290], dtype="float")).with_intensities(
            numpy.array([0.5, 0.2, 1.0], dtype="float")).build()

    spectrums = [spectrum_1, spectrum_2]
    cosine_greedy = CosineGreedy()
    scores = cosine_greedy.matrix(spectrums, spectrums, is_symmetric=symmetric)

    assert scores[0][0][0] == pytest.approx(
        scores[1][1][0], 0.000001), "Expected different cosine score."
    assert scores[0][0]["score"] == pytest.approx(scores[1][1]["score"], 0.000001), \
        "Expected different cosine score."
    assert scores[0][1][0] == pytest.approx(
        scores[1][0][0], 0.000001), "Expected different cosine score."
    assert scores[0][1]["score"] == pytest.approx(scores[1][0]["score"], 0.000001), \
        "Expected different cosine score."
Example no. 12
def return_list_cosine_scores(query, library, type):

    if type not in ("library", "decoy"):
        print("library type parameter must be either library or decoy")
        return False
    else:
        cosine_greedy = CosineGreedy(tolerance=0.2)
        counter = 1
        scores = []
        average_matches = 0
        milestone = 1

        if(type == "decoy"):        
            for spec in query:
                prelim_scores = []
                for d in library:
                    score, n_matches = cosine_greedy(d, spec)
                    average_matches = average_matches + n_matches
                    newscore = CosineHit(score, type, spec, d)
                    prelim_scores.append(newscore)          

                prelim_scores = sorted(prelim_scores)
                scores.append(prelim_scores[-1])


        if(type == "library"):
            for spec in query:
                prelim_scores = []
                for d in library:
                    if(are_peaks_similar(spec.metadata['precursor_mz'], d.metadata['precursor_mz']) == True):
                        score, n_matches = cosine_greedy(d, spec)
                        average_matches = average_matches + n_matches
                        newscore = CosineHit(score, type, spec, d)
                        prelim_scores.append(newscore)  
                    else:
                        newscore = CosineHit(0, type, spec, d)

                prelim_scores = sorted(prelim_scores)
#                    print("Scores are ")
#                    for s in prelim_scores:
#                        print(s.score)
#                    for s in prelim_scores: 
#                        if(are_spectrums_same(s.query, s.library) == True):
#                            print("true score is",  s.score)
                scores.append(prelim_scores[-1])
#                    print("Score taken: ",prelim_scores[-1].score )



        return scores
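Because only the single best CosineHit per query is kept, the sort-then-take-last pattern above could be reduced to max(); a sketch assuming, as the sorted() call already implies, that CosineHit instances order by score:

    best_hit = max(prelim_scores)   # highest-scoring CosineHit
    scores.append(best_hit)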
Example no. 13
def test_scores_by_reference_sorted():
    "Test scores_by_reference method with sort=True."
    spectrum_1, spectrum_2, spectrum_3, spectrum_4 = spectra()
    references = [spectrum_1, spectrum_2, spectrum_3]
    queries = [spectrum_3, spectrum_4, spectrum_2]

    scores = calculate_scores(references, queries, CosineGreedy())
    selected_scores = scores.scores_by_reference(spectrum_2, sort=True)

    expected_result = [(scores.queries[i], scores.scores[1, i])
                       for i in [2, 1, 0]]
    assert selected_scores == expected_result, "Expected different scores."
    scores_only = numpy.array([x[1]["score"] for x in selected_scores])
    scores_expected = numpy.array(
        [1.0, 0.6129713330865563, 0.1363196353181994])
    assert numpy.allclose(scores_only, scores_expected, atol=1e-8), \
        "Expected different sorted scores."
Example no. 14
def test_cosine_greedy_without_parameters():

    spectrum_1 = Spectrum(mz=numpy.array([100, 150, 200, 300, 500, 510, 1100],
                                         dtype="float"),
                          intensities=numpy.array(
                              [700, 200, 100, 1000, 200, 5, 500],
                              dtype="float"))

    spectrum_2 = Spectrum(mz=numpy.array([100, 140, 190, 300, 490, 510, 1090],
                                         dtype="float"),
                          intensities=numpy.array(
                              [700, 200, 100, 1000, 200, 5, 500],
                              dtype="float"))

    norm_spectrum_1 = normalize_intensities(spectrum_1)
    norm_spectrum_2 = normalize_intensities(spectrum_2)
    cosine_greedy = CosineGreedy()
    score, n_matches = cosine_greedy(norm_spectrum_1, norm_spectrum_2)

    assert score == pytest.approx(0.81421,
                                  0.0001), "Expected different cosine score."
    assert n_matches == 3
Example no. 15
def test_cosine_score_greedy_with_tolerance_2_0():

    spectrum_1 = Spectrum(mz=numpy.array([100, 200, 299, 300, 301, 500, 510],
                                         dtype="float"),
                          intensities=numpy.array(
                              [10, 10, 500, 100, 200, 20, 100], dtype="float"),
                          metadata=dict())

    spectrum_2 = Spectrum(mz=numpy.array([100, 200, 300, 301, 500, 512],
                                         dtype="float"),
                          intensities=numpy.array([10, 10, 500, 100, 20, 100],
                                                  dtype="float"),
                          metadata=dict())

    norm_spectrum_1 = normalize_intensities(spectrum_1)
    norm_spectrum_2 = normalize_intensities(spectrum_2)
    cosine_greedy = CosineGreedy(tolerance=2.0)
    score, n_matches = cosine_greedy(norm_spectrum_1, norm_spectrum_2)

    assert score == pytest.approx(0.903412,
                                  0.0001), "Expected different cosine score."
    assert n_matches == 6
Example no. 16
def test_scores_by_query():
    "Test scores_by_query method."
    spectrum_1 = Spectrum(mz=numpy.array([100, 150, 200.]),
                          intensities=numpy.array([0.7, 0.2, 0.1]),
                          metadata={'id': 'spectrum1'})
    spectrum_2 = Spectrum(mz=numpy.array([100, 140, 190.]),
                          intensities=numpy.array([0.4, 0.2, 0.1]),
                          metadata={'id': 'spectrum2'})
    spectrum_3 = Spectrum(mz=numpy.array([110, 140, 195.]),
                          intensities=numpy.array([0.6, 0.2, 0.1]),
                          metadata={'id': 'spectrum3'})
    spectrum_4 = Spectrum(mz=numpy.array([100, 150, 200.]),
                          intensities=numpy.array([0.6, 0.1, 0.6]),
                          metadata={'id': 'spectrum4'})
    references = [spectrum_1, spectrum_2, spectrum_3]
    queries = [spectrum_2, spectrum_3, spectrum_4]

    scores = Scores(references, queries, CosineGreedy()).calculate()
    selected_scores = scores.scores_by_query(spectrum_4)

    expected_result = [(scores.references[i], *scores.scores[i, 2]) for i in range(3)]
    assert selected_scores == expected_result, "Expected different scores."
Example no. 17
def test_cosine_score_greedy_order_of_arguments():

    spectrum_1 = Spectrum(mz=numpy.array([100, 200, 299, 300, 301, 500, 510],
                                         dtype="float"),
                          intensities=numpy.array(
                              [10, 10, 500, 100, 200, 20, 100], dtype="float"),
                          metadata=dict())

    spectrum_2 = Spectrum(mz=numpy.array([100, 200, 300, 301, 500, 512],
                                         dtype="float"),
                          intensities=numpy.array([10, 10, 500, 100, 20, 100],
                                                  dtype="float"),
                          metadata=dict())

    norm_spectrum_1 = normalize_intensities(spectrum_1)
    norm_spectrum_2 = normalize_intensities(spectrum_2)

    cosine_greedy = CosineGreedy(tolerance=2.0)
    score_1_2, n_matches_1_2 = cosine_greedy(norm_spectrum_1, norm_spectrum_2)
    score_2_1, n_matches_2_1 = cosine_greedy(norm_spectrum_2, norm_spectrum_1)

    assert score_1_2 == score_2_1, "Expected that the order of the arguments would not matter."
    assert n_matches_1_2 == n_matches_2_1, "Expected that the order of the arguments would not matter."
Example no. 18
def test_cosine_score_greedy_with_tolerance_0_2():
    spectrum_1 = Spectrum(mz=numpy.array([100, 150, 200, 300, 500, 510, 1100],
                                         dtype="float"),
                          intensities=numpy.array(
                              [700, 200, 100, 1000, 200, 5, 500],
                              dtype="float"),
                          metadata=dict())

    spectrum_2 = Spectrum(
        mz=numpy.array([50, 100, 200, 299.5, 489.5, 510.5, 1040],
                       dtype="float"),
        intensities=numpy.array([700, 200, 100, 1000, 200, 5, 500],
                                dtype="float"),
        metadata=dict())

    norm_spectrum_1 = normalize_intensities(spectrum_1)
    norm_spectrum_2 = normalize_intensities(spectrum_2)
    cosine_greedy = CosineGreedy(tolerance=0.2)
    score, n_matches = cosine_greedy(norm_spectrum_1, norm_spectrum_2)

    assert score == pytest.approx(0.081966,
                                  0.0001), "Expected different cosine score."
    assert n_matches == 2
Example no. 19
def test_scores_by_query_sorted():
    "Test scores_by_query method with sort=True."
    builder = SpectrumBuilder()
    spectrum_1 = builder.with_mz(numpy.array([100, 150, 200.])) \
        .with_intensities(numpy.array([0.7, 0.2, 0.1])) \
        .with_metadata({'id': 'spectrum1'}).build()
    spectrum_2 = builder.with_mz(numpy.array([100, 140, 190.])) \
        .with_intensities(numpy.array([0.4, 0.2, 0.1])) \
        .with_metadata({'id': 'spectrum2'}).build()
    spectrum_3 = builder.with_mz(numpy.array([100, 140, 195.])) \
        .with_intensities(numpy.array([0.6, 0.2, 0.1])) \
        .with_metadata({'id': 'spectrum3'}).build()
    spectrum_4 = builder.with_mz(numpy.array([100, 150, 200.])) \
        .with_intensities(numpy.array([0.6, 0.1, 0.6])) \
        .with_metadata({'id': 'spectrum4'}).build()

    references = [spectrum_1, spectrum_2, spectrum_3]
    queries = [spectrum_2, spectrum_3, spectrum_4]

    scores = calculate_scores(references, queries, CosineGreedy())
    selected_scores = scores.scores_by_query(spectrum_4, sort=True)

    expected_result = [(scores.references[i], scores.scores[i, 2])
                       for i in [0, 2, 1]]
    assert selected_scores == expected_result, "Expected different scores."
Example no. 20
def library_matching(documents_query: List[SpectrumDocument],
                     documents_library: List[SpectrumDocument],
                     model,
                     presearch_based_on=["parentmass", "spec2vec-top10"],
                     ignore_non_annotated: bool = True,
                     include_scores=["spec2vec", "cosine", "modcosine"],
                     intensity_weighting_power: float = 0.5,
                     allowed_missing_percentage: float = 0,
                     cosine_tol: float = 0.005,
                     mass_tolerance: float = 1.0):
    """Selecting potential spectra matches with spectra library.

    Suitable candidates will be selected by 1) top_n Spec2Vec similarity, and 2)
    same precursor mass (within given mz_ppm tolerance(s)).
    For later matching routines, additional scores (cosine, modified cosine)
    are added as well.

    Args:
    --------
    documents_query:
        List containing all spectrum documents that should be queried against the library.
    documents_library:
        List containing all library spectrum documents.
    model:
        Pretrained word2Vec model.
    top_n: int, optional
        Number of entries with the top_n highest Spec2Vec scores to keep as
        found matches. Default = 10.
    ignore_non_annotated: bool, optional
        If True, only annotated spectra will be considered for matching.
        Default = True.
    cosine_tol: float, optional
        Set tolerance for the cosine and modified cosine score. Default = 0.005
    mass_tolerance
        Specify tolerance for a parentmass match.
    """

    # Initializations
    found_matches = []
    m_mass_matches = None
    m_spec2vec_similarities = None

    def get_metadata(documents):
        metadata = []
        for doc in documents:
            metadata.append(doc._obj.get("smiles"))
        return metadata

    library_spectra_metadata = get_metadata(documents_library)
    if ignore_non_annotated:
        # Get array of all ids for spectra with smiles
        library_ids = np.asarray(
            [i for i, x in enumerate(library_spectra_metadata) if x])
    else:
        library_ids = np.arange(len(documents_library))

    msg = "Presearch must be done either by 'parentmass' and/or 'spec2vec-topX'"
    assert "parentmass" in presearch_based_on or np.any(
        ["spec2vec" in x for x in presearch_based_on]), msg

    # 1. Search for top-n Spec2Vec matches ------------------------------------
    if np.any(["spec2vec" in x for x in presearch_based_on]):
        top_n = int([
            x.split("top")[1] for x in presearch_based_on if "spec2vec" in x
        ][0])
        print("Pre-selection includes spec2vec top {}.".format(top_n))
        spec2vec = Spec2Vec(
            model=model,
            intensity_weighting_power=intensity_weighting_power,
            allowed_missing_percentage=allowed_missing_percentage)
        m_spec2vec_similarities = spec2vec.matrix(
            [documents_library[i] for i in library_ids], documents_query)

        # Select top_n similarity values:
        selection_spec2vec = np.argpartition(m_spec2vec_similarities,
                                             -top_n,
                                             axis=0)[-top_n:, :]
    else:
        selection_spec2vec = np.empty((0, len(documents_query)), dtype="int")

    # 2. Search for parent mass based matches ---------------------------------
    if "parentmass" in presearch_based_on:
        mass_matching = ParentmassMatch(mass_tolerance)
        m_mass_matches = mass_matching.matrix(
            [documents_library[i]._obj for i in library_ids],
            [x._obj for x in documents_query])
        selection_massmatch = []
        for i in range(len(documents_query)):
            selection_massmatch.append(np.where(m_mass_matches[:, i] == 1)[0])
    else:
        selection_massmatch = np.empty((len(documents_query), 0), dtype="int")

    # 3. Combine found matches ------------------------------------------------
    for i in range(len(documents_query)):
        s2v_top_ids = selection_spec2vec[:, i]
        mass_match_ids = selection_massmatch[i]

        all_match_ids = np.unique(np.concatenate(
            (s2v_top_ids, mass_match_ids)))

        if len(all_match_ids) > 0:
            if "modcosine" in include_scores:
                # Get cosine score for found matches
                cosine_similarity = CosineGreedy(tolerance=cosine_tol)
                cosine_scores = []
                for match_id in library_ids[all_match_ids]:
                    cosine_scores.append(
                        cosine_similarity.matrix(
                            documents_library[match_id]._obj,
                            documents_query[i]._obj))
            else:
                cosine_scores = len(all_match_ids) * ["not calculated"]

            if "cosine" in include_scores:
                # Get modified cosine score for found matches
                mod_cosine_similarity = ModifiedCosine(tolerance=cosine_tol)
                mod_cosine_scores = []
                for match_id in library_ids[all_match_ids]:
                    mod_cosine_scores.append(
                        mod_cosine_similarity.matrix(
                            documents_library[match_id]._obj,
                            documents_query[i]._obj))
            else:
                mod_cosine_scores = len(all_match_ids) * ["not calculated"]

            matches_df = pd.DataFrame(
                {
                    "cosine_score": [x[0] for x in cosine_scores],
                    "cosine_matches": [x[1] for x in cosine_scores],
                    "mod_cosine_score": [x[0] for x in mod_cosine_scores],
                    "mod_cosine_matches": [x[1] for x in mod_cosine_scores]
                },
                index=library_ids[all_match_ids])

            if m_mass_matches is not None:
                matches_df["mass_match"] = m_mass_matches[all_match_ids, i]

            if m_spec2vec_similarities is not None:
                matches_df["s2v_score"] = m_spec2vec_similarities[
                    all_match_ids, i]
            elif "spec2vec" in include_scores:
                spec2vec_similarity = Spec2Vec(
                    model=model,
                    intensity_weighting_power=intensity_weighting_power,
                    allowed_missing_percentage=allowed_missing_percentage)
                spec2vec_scores = []
                for match_id in library_ids[all_match_ids]:
                    spec2vec_scores.append(
                        spec2vec_similarity.pair(documents_library[match_id],
                                                 documents_query[i]))
                matches_df["s2v_score"] = spec2vec_scores
            found_matches.append(matches_df.fillna(0))
        else:
            found_matches.append([])

    return found_matches
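In library_matching the top-n count is parsed straight out of the presearch label, so 'spec2vec-top10' selects the ten best Spec2Vec candidates per query. A one-line illustration of that parsing:

    top_n = int("spec2vec-top10".split("top")[1])   # -> 10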
Example no. 21
def main(argv):
    parser = argparse.ArgumentParser(
        description="Compute MSP similarity scores")
    parser.add_argument("-s",
                        dest="symmetric",
                        action='store_true',
                        help="Computation is symmetric.")
    parser.add_argument("--ref",
                        dest="references_filename",
                        type=str,
                        help="Path to reference spectra library.")
    parser.add_argument("--ref_format",
                        dest="references_format",
                        type=str,
                        help="Reference spectra library file format.")
    parser.add_argument("queries_filename",
                        type=str,
                        help="Path to query spectra.")
    parser.add_argument("queries_format",
                        type=str,
                        help="Query spectra file format.")
    parser.add_argument("similarity_metric",
                        type=str,
                        help='Metric to use for matching.')
    parser.add_argument("tolerance",
                        type=float,
                        help="Tolerance to use for peak matching.")
    parser.add_argument(
        "mz_power",
        type=float,
        help="The power to raise mz to in the cosine function.")
    parser.add_argument(
        "intensity_power",
        type=float,
        help="The power to raise intensity to in the cosine function.")
    parser.add_argument("output_filename_scores",
                        type=str,
                        help="Path where to store the output .tsv scores.")
    parser.add_argument("output_filename_matches",
                        type=str,
                        help="Path where to store the output .tsv matches.")
    args = parser.parse_args()

    if args.queries_format == 'msp':
        queries_spectra = list(load_from_msp(args.queries_filename))
    elif args.queries_format == 'mgf':
        queries_spectra = list(load_from_mgf(args.queries_filename))
    else:
        raise ValueError(
            f'File format {args.queries_format} not supported for query spectra.'
        )

    if args.symmetric:
        reference_spectra = []
    else:
        if args.references_format == 'msp':
            reference_spectra = list(load_from_msp(args.references_filename))
        elif args.references_format == 'mgf':
            reference_spectra = list(load_from_mgf(args.references_filename))
        else:
            raise ValueError(
                f'File format {args.references_format} not supported for reference spectra library.'
            )

    if args.similarity_metric == 'CosineGreedy':
        similarity_metric = CosineGreedy(args.tolerance, args.mz_power,
                                         args.intensity_power)
    elif args.similarity_metric == 'CosineHungarian':
        similarity_metric = CosineHungarian(args.tolerance, args.mz_power,
                                            args.intensity_power)
    elif args.similarity_metric == 'ModifiedCosine':
        similarity_metric = ModifiedCosine(args.tolerance, args.mz_power,
                                           args.intensity_power)
        reference_spectra = list(map(convert_precursor_mz, reference_spectra))
        queries_spectra = list(map(convert_precursor_mz, queries_spectra))
    else:
        return -1

    print("Calculating scores...")
    scores = calculate_scores(
        references=queries_spectra if args.symmetric else reference_spectra,
        queries=queries_spectra,
        similarity_function=similarity_metric,
        is_symmetric=args.symmetric)

    write_outputs(args, scores)
    return 0
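The parser above consumes the positional arguments in the order they are declared (query file, query format, metric, tolerance, mz_power, intensity_power, score output, match output). A hypothetical invocation, with the script and file names as placeholders:

# python compute_similarity_scores.py --ref library.msp --ref_format msp \
#     queries.mgf mgf CosineGreedy 0.1 0.0 1.0 scores.tsv matches.tsv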
Example no. 22
def test_user_workflow():
    def apply_my_filters(s):
        s = default_filters(s)
        s = add_parent_mass(s)
        s = normalize_intensities(s)
        s = select_by_relative_intensity(s,
                                         intensity_from=0.0,
                                         intensity_to=1.0)
        s = select_by_mz(s, mz_from=0, mz_to=1000)
        s = require_minimum_number_of_peaks(s, n_required=5)
        return s

    module_root = os.path.join(os.path.dirname(__file__), '..')
    spectrums_file = os.path.join(module_root, 'tests', 'pesticides.mgf')

    # apply my filters to the data
    spectrums = [apply_my_filters(s) for s in load_from_mgf(spectrums_file)]

    # omit spectrums that didn't qualify for analysis
    spectrums = [s for s in spectrums if s is not None]

    # this will be a library grouping analysis, so queries = references = spectrums
    queries = spectrums[:]
    references = spectrums[:]

    # define similarity function
    cosine_greedy = CosineGreedy()

    # calculate_scores
    scores = list(calculate_scores(references, queries, cosine_greedy))

    # filter out self-comparisons, require at least 20 matching peaks:
    filtered = [(reference, query, score, n_matching)
                for (reference, query, score, n_matching) in scores
                if reference != query and n_matching >= 20]

    sorted_by_score = sorted(filtered, key=lambda elem: elem[2], reverse=True)

    actual_top10 = sorted_by_score[:10]

    expected_top10 = [
        (references[48], queries[50],
         pytest.approx(0.9994510368270997, rel=1e-9), 25),
        (references[50], queries[48],
         pytest.approx(0.9994510368270997, rel=1e-9), 25),
        (references[46], queries[48],
         pytest.approx(0.9981252309590571, rel=1e-9), 27),
        (references[48], queries[46],
         pytest.approx(0.9981252309590571, rel=1e-9), 27),
        (references[46], queries[50],
         pytest.approx(0.9979632203390496, rel=1e-9), 22),
        (references[50], queries[46],
         pytest.approx(0.9979632203390496, rel=1e-9), 22),
        (references[73], queries[74],
         pytest.approx(0.9956795920716246, rel=1e-9), 23),
        (references[74], queries[73],
         pytest.approx(0.9956795920716246, rel=1e-9), 23),
        (references[57], queries[59],
         pytest.approx(0.9886557001269415, rel=1e-9), 46),
        (references[59], queries[57],
         pytest.approx(0.9886557001269415, rel=1e-9), 46),
    ]
    assert actual_top10 == expected_top10
Example no. 23
def main(argv):
    parser = argparse.ArgumentParser(
        description="Compute MSP similarity scores")
    parser.add_argument("-f",
                        dest="default_filters",
                        action='store_true',
                        help="Apply default filters")
    parser.add_argument("-n",
                        dest="normalize_intensities",
                        action='store_true',
                        help="Normalize intensities.")
    parser.add_argument("-s",
                        dest="symmetric",
                        action='store_true',
                        help="Computation is symmetric.")
    parser.add_argument("--ref",
                        dest="references_filename",
                        type=str,
                        help="Path to reference MSP library.")
    parser.add_argument("queries_filename",
                        type=str,
                        help="Path to query spectra.")
    parser.add_argument("similarity_metric",
                        type=str,
                        help='Metric to use for matching.')
    parser.add_argument("tolerance",
                        type=float,
                        help="Tolerance to use for peak matching.")
    parser.add_argument(
        "mz_power",
        type=float,
        help="The power to raise mz to in the cosine function.")
    parser.add_argument(
        "intensity_power",
        type=float,
        help="The power to raise intensity to in the cosine function.")
    parser.add_argument("output_filename_scores",
                        type=str,
                        help="Path where to store the output .csv scores.")
    parser.add_argument("output_filename_matches",
                        type=str,
                        help="Path where to store the output .csv matches.")
    args = parser.parse_args()

    queries_spectra = list(load_from_msp(args.queries_filename))
    if args.symmetric:
        reference_spectra = []
    else:
        reference_spectra = list(load_from_msp(args.references_filename))

    if args.default_filters is True:
        print("Applying default filters...")
        queries_spectra = list(map(default_filters, queries_spectra))
        reference_spectra = list(map(default_filters, reference_spectra))

    if args.normalize_intensities is True:
        print("Normalizing intensities...")
        queries_spectra = list(map(normalize_intensities, queries_spectra))
        reference_spectra = list(map(normalize_intensities, reference_spectra))

    if args.similarity_metric == 'CosineGreedy':
        similarity_metric = CosineGreedy(args.tolerance, args.mz_power,
                                         args.intensity_power)
    elif args.similarity_metric == 'CosineHungarian':
        similarity_metric = CosineHungarian(args.tolerance, args.mz_power,
                                            args.intensity_power)
    elif args.similarity_metric == 'ModifiedCosine':
        similarity_metric = ModifiedCosine(args.tolerance, args.mz_power,
                                           args.intensity_power)
        reference_spectra = list(map(add_precursor_mz, reference_spectra))
        queries_spectra = list(map(add_precursor_mz, queries_spectra))
    else:
        return -1

    print("Calculating scores...")
    scores = calculate_scores(
        references=queries_spectra if args.symmetric else reference_spectra,
        queries=queries_spectra,
        similarity_function=similarity_metric,
        is_symmetric=args.symmetric)

    write_outputs(args, scores)
    return 0
Example no. 24
def library_matching(
        documents_query: List[SpectrumDocument],
        documents_library: List[SpectrumDocument],
        model: BaseTopicModel,
        presearch_based_on: List[str] = ["precursor_mz", "spec2vec-top10"],
        ignore_non_annotated: bool = True,
        include_scores=["spec2vec", "cosine", "modcosine"],
        intensity_weighting_power: float = 0.5,
        allowed_missing_percentage: float = 0,
        cosine_tol: float = 0.005,
        min_matches: int = 6,
        mass_tolerance: float = 2.0,
        mass_tolerance_type: str = "ppm"):
    """Selecting potential spectra matches with spectra library.

    Suitable candidates will be selected by 1) top_n Spec2Vec similarity, and 2)
    same precursor mass (within given mz_ppm tolerance(s)).
    For later matching routines, additional scores (cosine, modified cosine)
    are added as well.

    Args:
    --------
    documents_query:
        List containing all spectrum documents that should be queried against the library.
    documents_library:
        List containing all library spectrum documents.
    model:
        Pretrained word2Vec model.
    presearch_based_on:
        List with strings to specify which measures to use for the presearch.
        This can include 'precursor_mz', 'spec2vec-topX', and/or 'modcos-topX'.
    ignore_non_annotated: bool, optional
        If True, only annotated spectra will be considered for matching.
        Default = True.
    cosine_tol: float, optional
        Set tolerance for the cosine and modified cosine score. Default = 0.005
    mass_tolerance
        Specify tolerance for a mass match.
    mass_tolerance_type
        Choose between "ppm" (relative) and "Dalton" (absolute) tolerance type.
    """

    # Initializations
    found_matches = []
    m_mass_matches = None
    m_spec2vec_similarities = None
    m_modcos_similarities = None

    def get_metadata(documents):
        metadata = []
        for doc in documents:
            metadata.append(doc._obj.get("smiles"))
        return metadata

    library_spectra_metadata = get_metadata(documents_library)
    if ignore_non_annotated:
        # Get array of all ids for spectra with smiles
        library_ids = np.asarray(
            [i for i, x in enumerate(library_spectra_metadata) if x])
    else:
        library_ids = np.arange(len(documents_library))

    allowed_presearch_type = ["precursor_mz", "spec2vec-top", "modcos-top"]
    msg = "Presearch must include one of: " + ", ".join(allowed_presearch_type)
    assert np.any([(x in y) for x in allowed_presearch_type
                   for y in presearch_based_on]), msg

    # 1. Search for top-n Spec2Vec matches ------------------------------------
    if np.any(["spec2vec" in x for x in presearch_based_on]):
        top_n = int([
            x.split("top")[1] for x in presearch_based_on if "spec2vec" in x
        ][0])
        print(f"Pre-selection includes spec2vec top {top_n}.")
        spec2vec = Spec2Vec(
            model=model,
            intensity_weighting_power=intensity_weighting_power,
            allowed_missing_percentage=allowed_missing_percentage,
            progress_bar=True)
        m_spec2vec_similarities = spec2vec.matrix(
            [documents_library[i] for i in library_ids], documents_query)

        # Select top_n similarity values:
        selection_spec2vec = np.argpartition(m_spec2vec_similarities,
                                             -top_n,
                                             axis=0)[-top_n:, :]
    else:
        selection_spec2vec = np.empty((0, len(documents_query)), dtype="int")

    # 2. Search for precursor_mz based matches ---------------------------------
    if "precursor_mz" in presearch_based_on:
        print(
            f"Pre-selection includes mass matches within {mass_tolerance} {mass_tolerance_type}."
        )
        mass_matching = PrecursorMzMatch(tolerance=mass_tolerance,
                                         tolerance_type=mass_tolerance_type)
        m_mass_matches = mass_matching.matrix(
            [documents_library[i]._obj for i in library_ids],
            [x._obj for x in documents_query])
        selection_massmatch = []
        for i in range(len(documents_query)):
            selection_massmatch.append(np.where(m_mass_matches[:, i] == 1)[0])
    else:
        selection_massmatch = np.empty((len(documents_query), 0), dtype="int")

    # 3. Search for top-n modified cosine matches ------------------------------------
    if np.any(["modcos" in x for x in presearch_based_on]):
        top_n = int([
            x.split("top")[1] for x in presearch_based_on if "modcos" in x
        ][0])
        print(f"Pre-selection includes modified cosine top {top_n}.")
        modcos = ModifiedCosine(tolerance=cosine_tol)

        n_rows = len(library_ids)
        n_cols = len(documents_query)
        m_modcos_similarities = np.zeros([n_rows, n_cols], dtype=np.float64)
        m_modcos_matches = np.zeros([n_rows, n_cols], dtype=np.float64)
        for i_ref, reference in enumerate(
                tqdm([documents_library[i]._obj for i in library_ids])):
            for i_query, query in enumerate([x._obj for x in documents_query]):
                score = modcos.pair(reference, query)
                m_modcos_similarities[i_ref][i_query] = score[0]
                m_modcos_matches[i_ref][i_query] = score[1]

        # Select top_n similarity values:
        m_modcos_selected = m_modcos_similarities.copy()
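        # Zero out candidates with fewer than min_matches matched peaks so they cannot reach the top_n.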
        m_modcos_selected[m_modcos_matches < min_matches] = 0
        selection_modcos = np.argpartition(m_modcos_selected, -top_n,
                                           axis=0)[-top_n:, :]
    else:
        selection_modcos = np.empty((0, len(documents_query)), dtype="int")

    # 4. Combine found matches ------------------------------------------------
    if "cosine" in include_scores:
        print("Calculate cosine score for selected candidates.")
    if "modcosine" in include_scores:
        print("Calculate modified cosine score for selected candidates.")

    for i in tqdm(range(len(documents_query))):
        s2v_top_ids = selection_spec2vec[:, i]
        mass_match_ids = selection_massmatch[i]
        modcos_ids = selection_modcos[:, i]

        all_match_ids = np.unique(
            np.concatenate((s2v_top_ids, mass_match_ids, modcos_ids)))

        if len(all_match_ids) > 0:
            if "cosine" in include_scores:
                # Get cosine score for found matches
                cosine_similarity = CosineGreedy(tolerance=cosine_tol)
                cosine_scores = []
                for match_id in library_ids[all_match_ids]:
                    cosine_scores.append(
                        cosine_similarity.pair(
                            documents_library[match_id]._obj,
                            documents_query[i]._obj))
            else:
                cosine_scores = len(all_match_ids) * ["not calculated"]

            if m_modcos_similarities is not None:
                mod_cosine_scores0 = [
                    x for x in m_modcos_similarities[all_match_ids, i]
                ]
                mod_cosine_scores1 = [
                    x for x in m_modcos_matches[all_match_ids, i]
                ]
                mod_cosine_scores = list(
                    zip(mod_cosine_scores0, mod_cosine_scores1))
            elif "modcosine" in include_scores:
                # Get modified cosine score for found matches
                mod_cosine_similarity = ModifiedCosine(tolerance=cosine_tol)
                mod_cosine_scores = []
                for match_id in library_ids[all_match_ids]:
                    mod_cosine_scores.append(
                        mod_cosine_similarity.pair(
                            documents_library[match_id]._obj,
                            documents_query[i]._obj))
            else:
                mod_cosine_scores = len(all_match_ids) * ["not calculated"]

            matches_df = pd.DataFrame(
                {
                    "cosine_score": [x["score"] for x in cosine_scores],
                    "cosine_matches": [x["matches"] for x in cosine_scores],
                    "mod_cosine_score":
                    [x["score"] for x in mod_cosine_scores],
                    "mod_cosine_matches":
                    [x["matches"] for x in mod_cosine_scores]
                },
                index=library_ids[all_match_ids])

            if m_mass_matches is not None:
                matches_df["mass_match"] = m_mass_matches[all_match_ids, i]

            if m_spec2vec_similarities is not None:
                matches_df["s2v_score"] = m_spec2vec_similarities[
                    all_match_ids, i]
            elif "spec2vec" in include_scores:
                spec2vec_similarity = Spec2Vec(
                    model=model,
                    intensity_weighting_power=intensity_weighting_power,
                    allowed_missing_percentage=allowed_missing_percentage)
                spec2vec_scores = []
                for match_id in library_ids[all_match_ids]:
                    spec2vec_scores.append(
                        spec2vec_similarity.pair(documents_library[match_id],
                                                 documents_query[i]))
                matches_df["s2v_score"] = spec2vec_scores
            found_matches.append(matches_df.fillna(0))
        else:
            found_matches.append([])

    return found_matches
Example no. 25
def main(argv):
    parser = argparse.ArgumentParser(
        description="Compute MSP similarity scores")
    parser.add_argument("references_filename",
                        type=str,
                        help="Path to reference MSP library.")
    parser.add_argument("queries_filename",
                        type=str,
                        help="Path to query spectra.")
    parser.add_argument("similarity_metric",
                        type=str,
                        help='Metric to use for matching.')
    parser.add_argument("output_filename_scores",
                        type=str,
                        help="Path where to store the output .csv scores.")
    parser.add_argument("output_filename_matches",
                        type=str,
                        help="Path where to store the output .csv matches.")
    parser.add_argument("tolerance",
                        type=float,
                        help="Tolerance to use for peak matching.")
    parser.add_argument(
        "mz_power",
        type=float,
        help="The power to raise mz to in the cosine function.")
    parser.add_argument(
        "intensity_power",
        type=float,
        help="The power to raise intensity to in the cosine function.")

    args = parser.parse_args()

    reference_spectra = load_from_msp(args.references_filename)
    queries_spectra = load_from_msp(args.queries_filename)

    if args.similarity_metric == 'CosineGreedy':
        similarity_metric = CosineGreedy(args.tolerance, args.mz_power,
                                         args.intensity_power)
    elif args.similarity_metric == 'CosineHungarian':
        similarity_metric = CosineHungarian(args.tolerance, args.mz_power,
                                            args.intensity_power)
    elif args.similarity_metric == 'ModifiedCosine':
        similarity_metric = ModifiedCosine(args.tolerance, args.mz_power,
                                           args.intensity_power)
        reference_spectra = map(add_precursor_mz, reference_spectra)
        queries_spectra = map(add_precursor_mz, queries_spectra)
    else:
        return -1

    scores = calculate_scores(
        references=list(reference_spectra),
        queries=list(queries_spectra),
        similarity_function=similarity_metric,
    )

    query_names = [spectra.metadata['name'] for spectra in scores.queries]
    reference_names = [
        spectra.metadata['name'] for spectra in scores.references
    ]

    # Write scores to dataframe
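    # scores.scores is a (n_references x n_queries) structured array; iterating it yields one row per reference.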
    dataframe_scores = DataFrame(
        data=[entry["score"] for entry in scores.scores],
        index=reference_names,
        columns=query_names)
    dataframe_scores.to_csv(args.output_filename_scores, sep=';')

    # Write number of matches to dataframe
    dataframe_matches = DataFrame(
        data=[entry["matches"] for entry in scores.scores],
        index=reference_names,
        columns=query_names)
    dataframe_matches.to_csv(args.output_filename_matches, sep=';')
    return 0