def create_dummy_scores_symmetric():
    spectrums = create_dummy_spectrums()

    # Create Scores object by calculating dice scores
    similarity_measure = FingerprintSimilarity("dice")
    scores = calculate_scores(spectrums, spectrums, similarity_measure)
    return scores
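Several of these fixtures rely on a create_dummy_spectrums() helper that is not shown. A minimal hypothetical sketch (not the project's actual fixture): a handful of Spectrum objects carrying a binary "fingerprint" and a precursor_mz in their metadata, which is what FingerprintSimilarity and ModifiedCosine need to compare.

import numpy
from matchms import Spectrum


def create_dummy_spectrums():
    """Hypothetical stand-in: ten spectra with simple binary fingerprints in metadata."""
    spectrums = []
    for i in range(10):
        # Binary representation of i + 1 serves as a small, non-empty fingerprint.
        fingerprint = numpy.array([((i + 1) >> bit) & 1 for bit in range(8)])
        spectrums.append(Spectrum(mz=numpy.array([100.0, 200.0]),
                                  intensities=numpy.array([0.7, 0.3]),
                                  metadata={"id": f"dummy_{i}",
                                            "precursor_mz": 100.0 + i,
                                            "fingerprint": fingerprint}))
    return spectrums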
def test_fingerprint_similarity_with_scores_sorting():
    """Test if score works with Scores.scores_by_query and sorting."""
    spectrum0 = Spectrum(mz=numpy.array([100.0, 101.0], dtype="float"),
                         intensities=numpy.array([0.4, 0.5], dtype="float"),
                         metadata={})

    fingerprint1 = numpy.array(
        [0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0])
    spectrum1 = Spectrum(mz=numpy.array([100.0, 101.0], dtype="float"),
                         intensities=numpy.array([0.4, 0.5], dtype="float"),
                         metadata={"fingerprint": fingerprint1})

    fingerprint2 = numpy.array(
        [0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1])
    spectrum2 = Spectrum(mz=numpy.array([100.0, 101.0], dtype="float"),
                         intensities=numpy.array([0.4, 0.5], dtype="float"),
                         metadata={"fingerprint": fingerprint2})

    similarity_measure = FingerprintSimilarity(set_empty_scores=0,
                                               similarity_measure="cosine")

    scores = calculate_scores([spectrum0, spectrum1, spectrum2],
                              [spectrum0, spectrum1, spectrum2],
                              similarity_measure)

    scores_sorted = scores.scores_by_query(spectrum1, sort=True)
    expected_scores = numpy.array([1.0, 0.84515425, 0.0])
    assert numpy.allclose(numpy.array([x[1] for x in scores_sorted]),
                          expected_scores, atol=1e-6), \
        "Expected different scores and/or order."
def create_dummy_scores_symmetric_modified_cosine():
    spectrums = create_dummy_spectrums()

    # Create Scores object by calculating modified cosine scores
    similarity_measure = ModifiedCosine()
    scores = calculate_scores(spectrums, spectrums, similarity_measure)
    return scores
Example #4
def test_scores_by_reference_sorted():
    "Test scores_by_reference method with sort=True."
    spectrum_1 = Spectrum(mz=numpy.array([100, 150, 200.]),
                          intensities=numpy.array([0.7, 0.2, 0.1]),
                          metadata={'id': 'spectrum1'})
    spectrum_2 = Spectrum(mz=numpy.array([100, 140, 190.]),
                          intensities=numpy.array([0.4, 0.2, 0.1]),
                          metadata={'id': 'spectrum2'})
    spectrum_3 = Spectrum(mz=numpy.array([110, 140, 195.]),
                          intensities=numpy.array([0.6, 0.2, 0.1]),
                          metadata={'id': 'spectrum3'})
    spectrum_4 = Spectrum(mz=numpy.array([100, 150, 200.]),
                          intensities=numpy.array([0.6, 0.1, 0.6]),
                          metadata={'id': 'spectrum4'})
    references = [spectrum_1, spectrum_2, spectrum_3]
    queries = [spectrum_3, spectrum_4, spectrum_2]

    scores = calculate_scores(references, queries, CosineGreedy())
    selected_scores = scores.scores_by_reference(spectrum_2, sort=True)

    expected_result = [(scores.queries[i], scores.scores[1, i])
                       for i in [2, 1, 0]]
    assert selected_scores == expected_result, "Expected different scores."
    scores_only = numpy.array([x[1]["score"] for x in selected_scores])
    scores_expected = numpy.array(
        [1.0, 0.6129713330865563, 0.1363196353181994])
    assert numpy.allclose(scores_only, scores_expected, atol=1e-8), \
        "Expected different sorted scores."
def library_match(spectra_list,
                  lib_mgf,
                  precursor_tol=1.0,
                  cosine=0.7,
                  n_peaks=3):
    """Reads a given library mgf file and matches the given spectra to the library spectra using normal cosine.
    Each test spectra is given the name of the library spectra match with the highest cosine score."""

    library = load_from_mgf(lib_mgf)

    # Apply filters to clean and enhance each spectrum
    library_spectra = []
    for spectrum in library:
        # spectrum = default_filters(spectrum)
        # Scale peak intensities to maximum of 1
        spectrum = normalize_intensities(spectrum)
        library_spectra.append(spectrum)

    scores = calculate_scores(references=library_spectra,
                              queries=spectra_list,
                              similarity_function=CosineHungarian())

    scores_list = []
    for score in scores:
        print(score)
        scores_list.append(score)

    scores_list.sort(reverse=True, key=lambda entry: entry[2])

    # The original snippet ends here; returning the ranked matches is an assumption added for usability.
    return scores_list
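A minimal usage sketch of library_match, assuming the return value added above; "queries.mgf" and "library.mgf" are placeholder file names, not files from the original project.

if __name__ == "__main__":
    # Normalize the query spectra, then rank them against the library (placeholder paths).
    query_spectra = [normalize_intensities(s) for s in load_from_mgf("queries.mgf")]
    ranked = library_match(query_spectra, "library.mgf")
    print(ranked[:5])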
Example #6
def create_dummy_scores():
    spectrums = create_dummy_spectrums()
    references = spectrums[:5]
    queries = spectrums[5:]

    # Create Scores object by calculating dice scores
    similarity_measure = FingerprintSimilarity("dice")
    scores = calculate_scores(references, queries, similarity_measure)
    return scores
Example #7
def test_metadata_match_strings(spectrums):
    """Test basic metadata matching between string entries."""
    references = spectrums[:2]
    queries = spectrums[2:]

    similarity_score = MetadataMatch(field="instrument_type")
    scores = calculate_scores(references, queries, similarity_score)
    assert np.all(
        scores.scores == [[1, 0], [0, 0]]), "Expected different scores."
Example #8
def test_metadata_match_numerical(spectrums, tolerance, expected):
    """Test basic metadata matching between numerical entries."""
    references = spectrums[:2]
    queries = spectrums[2:]

    similarity_score = MetadataMatch(field="retention_time",
                                     matching_type="difference",
                                     tolerance=tolerance)
    scores = calculate_scores(references, queries, similarity_score)
    assert np.all(scores.scores == expected), "Expected different scores."
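The two MetadataMatch tests above rely on a pytest spectrums fixture and on parametrization that are not shown here. A hypothetical sketch, with metadata values chosen to stay consistent with the expected score matrices in the snippets (assuming the same matchms API used above):

import numpy as np
import pytest
from matchms import Spectrum, calculate_scores
from matchms.similarity import MetadataMatch


@pytest.fixture
def spectrums():
    # First two spectra act as references, last two as queries (values are made up).
    metadata = [
        {"instrument_type": "orbitrap", "retention_time": 100.0},
        {"instrument_type": "qtof", "retention_time": 200.0},
        {"instrument_type": "orbitrap", "retention_time": 300.0},
        {"instrument_type": "ft-icr", "retention_time": 400.0},
    ]
    return [Spectrum(mz=np.array([100.0, 200.0]),
                     intensities=np.array([0.7, 0.3]),
                     metadata=entry) for entry in metadata]


@pytest.mark.parametrize("tolerance, expected", [
    (150.0, [[0, 0], [1, 0]]),  # only |200 - 300| = 100 falls within 150
    (250.0, [[1, 0], [1, 1]]),  # all differences except |100 - 400| = 300 fall within 250
])
def test_metadata_match_numerical_sketch(spectrums, tolerance, expected):
    similarity_score = MetadataMatch(field="retention_time",
                                     matching_type="difference",
                                     tolerance=tolerance)
    scores = calculate_scores(spectrums[:2], spectrums[2:], similarity_score)
    assert np.all(scores.scores == expected), "Expected different scores."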
def test_user_workflow():

    def apply_my_filters(s):
        s = default_filters(s)
        s = add_parent_mass(s)
        s = normalize_intensities(s)
        s = select_by_relative_intensity(s, intensity_from=0.0, intensity_to=1.0)
        s = select_by_mz(s, mz_from=0, mz_to=1000)
        s = require_minimum_number_of_peaks(s, n_required=5)
        return s

    module_root = os.path.join(os.path.dirname(__file__), "..")
    spectrums_file = os.path.join(module_root, "tests", "pesticides.mgf")

    # apply my filters to the data
    spectrums = [apply_my_filters(s) for s in load_from_mgf(spectrums_file)]

    # omit spectrums that didn't qualify for analysis
    spectrums = [s for s in spectrums if s is not None]

    # this will be a library grouping analysis, so queries = references = spectrums
    queries = spectrums[:]
    references = spectrums[:]

    # define similarity function
    cosine_greedy = CosineGreedy(tolerance=0.3)

    # calculate_scores
    scores = list(calculate_scores(references,
                                   queries,
                                   cosine_greedy))

    # filter out self-comparisons, require at least 20 matching peaks:
    filtered = [(reference, query, score, n_matching) for (reference, query, score, n_matching) in scores
                if reference != query and n_matching >= 20]

    sorted_by_score = sorted(filtered, key=lambda elem: elem[2], reverse=True)

    actual_top10 = sorted_by_score[:10]

    expected_top10 = [
        (references[48], queries[50], pytest.approx(0.9994783627790965, rel=1e-9), 25),
        (references[50], queries[48], pytest.approx(0.9994783627790965, rel=1e-9), 25),
        (references[46], queries[48], pytest.approx(0.9990141860269471, rel=1e-9), 27),
        (references[48], queries[46], pytest.approx(0.9990141860269471, rel=1e-9), 27),
        (references[46], queries[50], pytest.approx(0.9988793406908719, rel=1e-9), 22),
        (references[50], queries[46], pytest.approx(0.9988793406908719, rel=1e-9), 22),
        (references[57], queries[59], pytest.approx(0.9982171275552505, rel=1e-9), 46),
        (references[59], queries[57], pytest.approx(0.9982171275552505, rel=1e-9), 46),
        (references[73], queries[74], pytest.approx(0.9973823244169199, rel=1e-9), 23),
        (references[74], queries[73], pytest.approx(0.9973823244169199, rel=1e-9), 23),
    ]
    assert actual_top10 == expected_top10
Example #10
def test_metadata_match_strings_wrong_method(spectrums, caplog):
    """Test basic metadata matching between string entries."""
    references = spectrums[:2]
    queries = spectrums[2:]

    similarity_score = MetadataMatch(field="instrument_type",
                                     matching_type="difference")
    scores = calculate_scores(references, queries, similarity_score)
    assert np.all(
        scores.scores == [[0, 0], [0, 0]]), "Expected different scores."
    msg = "not compatible with 'difference' method"
    assert msg in caplog.text
Example #11
def test_scores_by_query_non_tuple_score():
    "Test scores_by_query method."
    spectrum_1, spectrum_2, spectrum_3, spectrum_4 = spectra()
    references = [spectrum_1, spectrum_2, spectrum_3]
    queries = [spectrum_2, spectrum_3, spectrum_4]

    scores = calculate_scores(references, queries, IntersectMz())
    selected_scores = scores.scores_by_query(spectrum_4)

    expected_result = [(scores.references[i], scores.scores[i, 2])
                       for i in range(3)]
    assert selected_scores == expected_result, "Expected different scores."
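The spectra() helper used by this and several of the following tests is not shown; judging from the inline definitions in the later examples, it presumably returns these four spectra:

def spectra():
    spectrum_1 = Spectrum(mz=numpy.array([100, 150, 200.]),
                          intensities=numpy.array([0.7, 0.2, 0.1]),
                          metadata={'id': 'spectrum1'})
    spectrum_2 = Spectrum(mz=numpy.array([100, 140, 190.]),
                          intensities=numpy.array([0.4, 0.2, 0.1]),
                          metadata={'id': 'spectrum2'})
    spectrum_3 = Spectrum(mz=numpy.array([110, 140, 195.]),
                          intensities=numpy.array([0.6, 0.2, 0.1]),
                          metadata={'id': 'spectrum3'})
    spectrum_4 = Spectrum(mz=numpy.array([100, 150, 200.]),
                          intensities=numpy.array([0.6, 0.1, 0.6]),
                          metadata={'id': 'spectrum4'})
    return spectrum_1, spectrum_2, spectrum_3, spectrum_4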
Example #12
def test_scores_by_reference():
    "Test scores_by_reference method."
    spectrum_1, spectrum_2, spectrum_3, spectrum_4 = spectra()
    references = [spectrum_1, spectrum_2, spectrum_3]
    queries = [spectrum_3, spectrum_4]

    scores = calculate_scores(references, queries, CosineGreedy())
    selected_scores = scores.scores_by_reference(spectrum_2)

    expected_result = [(scores.queries[i], scores.scores[1, i])
                       for i in range(2)]
    assert selected_scores == expected_result, "Expected different scores."
def test_user_workflow_spec2vec():
    def apply_my_filters(s):
        s = default_filters(s)
        s = add_parent_mass(s)
        s = add_losses(s)
        s = normalize_intensities(s)
        s = select_by_relative_intensity(s,
                                         intensity_from=0.01,
                                         intensity_to=1.0)
        s = select_by_mz(s, mz_from=0, mz_to=1000)
        s = require_minimum_number_of_peaks(s, n_required=5)
        return s

    module_root = os.path.join(os.path.dirname(__file__), '..')
    spectrums_file = os.path.join(module_root, 'tests', 'pesticides.mgf')

    # apply my filters to the data
    spectrums = [apply_my_filters(s) for s in load_from_mgf(spectrums_file)]

    # omit spectrums that didn't qualify for analysis
    spectrums = [s for s in spectrums if s is not None]

    documents = [SpectrumDocument(s) for s in spectrums]

    # create and train model
    model = gensim.models.Word2Vec([d.words for d in documents],
                                   size=5,
                                   min_count=1)
    model.train([d.words for d in documents],
                total_examples=len(documents),
                epochs=20)

    # define similarity_function
    spec2vec = Spec2Vec(model=model, documents=documents)

    references = documents[:26]
    queries = documents[25:]

    # calculate scores on all combinations of references and queries
    scores = list(calculate_scores(references, queries, spec2vec))

    # filter out self-comparisons
    filtered = [(reference, query, score)
                for (reference, query, score) in scores if reference != query]

    sorted_by_score = sorted(filtered, key=lambda elem: elem[2], reverse=True)

    actual_top10 = sorted_by_score[:10]

    actual_scores = [score for (reference, query, score) in actual_top10]

    assert max(actual_scores) > 0.99
Example #14
def test_scores_by_reference_sorted():
    "Test scores_by_reference method with sort=True."
    spectrum_1, spectrum_2, spectrum_3, spectrum_4 = spectra()
    references = [spectrum_1, spectrum_2, spectrum_3]
    queries = [spectrum_3, spectrum_4, spectrum_2]

    scores = calculate_scores(references, queries, CosineGreedy())
    selected_scores = scores.scores_by_reference(spectrum_2, sort=True)

    expected_result = [(scores.queries[i], scores.scores[1, i])
                       for i in [2, 1, 0]]
    assert selected_scores == expected_result, "Expected different scores."
    scores_only = numpy.array([x[1]["score"] for x in selected_scores])
    scores_expected = numpy.array(
        [1.0, 0.6129713330865563, 0.1363196353181994])
    assert numpy.allclose(scores_only, scores_expected, atol=1e-8), \
        "Expected different sorted scores."
Example #15
def test_scores_by_query_non_tuple_score():
    "Test scores_by_query method."
    spectrum_1 = Spectrum(mz=numpy.array([100, 150, 200.]),
                          intensities=numpy.array([0.7, 0.2, 0.1]),
                          metadata={'id': 'spectrum1'})
    spectrum_2 = Spectrum(mz=numpy.array([100, 140, 190.]),
                          intensities=numpy.array([0.4, 0.2, 0.1]),
                          metadata={'id': 'spectrum2'})
    spectrum_3 = Spectrum(mz=numpy.array([110, 140, 195.]),
                          intensities=numpy.array([0.6, 0.2, 0.1]),
                          metadata={'id': 'spectrum3'})
    spectrum_4 = Spectrum(mz=numpy.array([100, 150, 200.]),
                          intensities=numpy.array([0.6, 0.1, 0.6]),
                          metadata={'id': 'spectrum4'})
    references = [spectrum_1, spectrum_2, spectrum_3]
    queries = [spectrum_2, spectrum_3, spectrum_4]

    scores = calculate_scores(references, queries, IntersectMz())
    selected_scores = scores.scores_by_query(spectrum_4)

    expected_result = [(scores.references[i], scores.scores[i, 2])
                       for i in range(3)]
    assert selected_scores == expected_result, "Expected different scores."
Example #16
def test_scores_by_query_sorted():
    "Test scores_by_query method with sort=True."
    spectrum_1 = Spectrum(mz=numpy.array([100, 150, 200.]),
                          intensities=numpy.array([0.7, 0.2, 0.1]),
                          metadata={'id': 'spectrum1'})
    spectrum_2 = Spectrum(mz=numpy.array([100, 140, 190.]),
                          intensities=numpy.array([0.4, 0.2, 0.1]),
                          metadata={'id': 'spectrum2'})
    spectrum_3 = Spectrum(mz=numpy.array([100, 140, 195.]),
                          intensities=numpy.array([0.6, 0.2, 0.1]),
                          metadata={'id': 'spectrum3'})
    spectrum_4 = Spectrum(mz=numpy.array([100, 150, 200.]),
                          intensities=numpy.array([0.6, 0.1, 0.6]),
                          metadata={'id': 'spectrum4'})
    references = [spectrum_1, spectrum_2, spectrum_3]
    queries = [spectrum_2, spectrum_3, spectrum_4]

    scores = calculate_scores(references, queries, CosineGreedy())
    selected_scores = scores.scores_by_query(spectrum_4, sort=True)

    expected_result = [(scores.references[i], scores.scores[i, 2])
                       for i in [0, 2, 1]]
    assert selected_scores == expected_result, "Expected different scores."
Example #17
def test_scores_by_reference():
    "Test scores_by_reference method."
    spectrum_1 = Spectrum(mz=numpy.array([100, 150, 200.]),
                          intensities=numpy.array([0.7, 0.2, 0.1]),
                          metadata={'id': 'spectrum1'})
    spectrum_2 = Spectrum(mz=numpy.array([100, 140, 190.]),
                          intensities=numpy.array([0.4, 0.2, 0.1]),
                          metadata={'id': 'spectrum2'})
    spectrum_3 = Spectrum(mz=numpy.array([110, 140, 195.]),
                          intensities=numpy.array([0.6, 0.2, 0.1]),
                          metadata={'id': 'spectrum3'})
    spectrum_4 = Spectrum(mz=numpy.array([100, 150, 200.]),
                          intensities=numpy.array([0.6, 0.1, 0.6]),
                          metadata={'id': 'spectrum4'})
    references = [spectrum_1, spectrum_2, spectrum_3]
    queries = [spectrum_3, spectrum_4]

    scores = calculate_scores(references, queries, CosineGreedy())
    selected_scores = scores.scores_by_reference(spectrum_2)

    expected_result = [(scores.queries[i], scores.scores[1, i])
                       for i in range(2)]
    assert selected_scores == expected_result, "Expected different scores."
    file = load_from_mgf(input_mgf)
    print(file)

    print("normalising intensities")

    # Apply filters to clean and enhance each spectrum
    spectrums = []
    for spectrum in file:
        spectrum = default_filters(spectrum)
        # Scale peak intensities to maximum of 1
        spectrum = normalize_intensities(spectrum)
        print(spectrum.get('precursor_mz'))
        spectrums.append(spectrum)

    scores = calculate_scores(
        references=spectrums,
        queries=spectrums,
        similarity_function=ModifiedCosine(tolerance=args.fragment_tolerance))

    spectra_matches = convert.convert_scores(scores)
    spectra_list = []
    for s in spectrums:
        new = convert.convert_spectrum(s)
        spectra_list.append(new)

else:
    from msmolnet import read_mgf as mgf

    input_mgf = f'{args.input}.mgf'
    print(f"reading file {input_mgf}")
    spectra_list = mgf.read_mgf(input_mgf)
Example #19
def main(argv):
    parser = argparse.ArgumentParser(
        description="Compute MSP similarity scores")
    parser.add_argument("references_filename",
                        type=str,
                        help="Path to reference MSP library.")
    parser.add_argument("queries_filename",
                        type=str,
                        help="Path to query spectra.")
    parser.add_argument("similarity_metric",
                        type=str,
                        help='Metric to use for matching.')
    parser.add_argument("output_filename_scores",
                        type=str,
                        help="Path where to store the output .csv scores.")
    parser.add_argument("output_filename_matches",
                        type=str,
                        help="Path where to store the output .csv matches.")
    parser.add_argument("tolerance",
                        type=float,
                        help="Tolerance to use for peak matching.")
    parser.add_argument(
        "mz_power",
        type=float,
        help="The power to raise mz to in the cosine function.")
    parser.add_argument(
        "intensity_power",
        type=float,
        help="The power to raise intensity to in the cosine function.")

    args = parser.parse_args()

    reference_spectra = load_from_msp(args.references_filename)
    queries_spectra = load_from_msp(args.queries_filename)

    if args.similarity_metric == 'CosineGreedy':
        similarity_metric = CosineGreedy(args.tolerance, args.mz_power,
                                         args.intensity_power)
    elif args.similarity_metric == 'CosineHungarian':
        similarity_metric = CosineHungarian(args.tolerance, args.mz_power,
                                            args.intensity_power)
    elif args.similarity_metric == 'ModifiedCosine':
        similarity_metric = ModifiedCosine(args.tolerance, args.mz_power,
                                           args.intensity_power)
        reference_spectra = map(add_precursor_mz, reference_spectra)
        queries_spectra = map(add_precursor_mz, queries_spectra)
    else:
        return -1

    scores = calculate_scores(
        references=list(reference_spectra),
        queries=list(queries_spectra),
        similarity_function=similarity_metric,
    )

    query_names = [spectra.metadata['name'] for spectra in scores.queries]
    reference_names = [
        spectra.metadata['name'] for spectra in scores.references
    ]

    # Write scores to dataframe
    dataframe_scores = DataFrame(
        data=[entry["score"] for entry in scores.scores],
        index=reference_names,
        columns=query_names)
    dataframe_scores.to_csv(args.output_filename_scores, sep=';')

    # Write number of matches to dataframe
    dataframe_matches = DataFrame(
        data=[entry["matches"] for entry in scores.scores],
        index=reference_names,
        columns=query_names)
    dataframe_matches.to_csv(args.output_filename_matches, sep=';')
    return 0
Example #20
def main(argv):
    parser = argparse.ArgumentParser(
        description="Compute MSP similarity scores")
    parser.add_argument("-f",
                        dest="default_filters",
                        action='store_true',
                        help="Apply default filters")
    parser.add_argument("-n",
                        dest="normalize_intensities",
                        action='store_true',
                        help="Normalize intensities.")
    parser.add_argument("-s",
                        dest="symmetric",
                        action='store_true',
                        help="Computation is symmetric.")
    parser.add_argument("--ref",
                        dest="references_filename",
                        type=str,
                        help="Path to reference MSP library.")
    parser.add_argument("queries_filename",
                        type=str,
                        help="Path to query spectra.")
    parser.add_argument("similarity_metric",
                        type=str,
                        help='Metric to use for matching.')
    parser.add_argument("tolerance",
                        type=float,
                        help="Tolerance to use for peak matching.")
    parser.add_argument(
        "mz_power",
        type=float,
        help="The power to raise mz to in the cosine function.")
    parser.add_argument(
        "intensity_power",
        type=float,
        help="The power to raise intensity to in the cosine function.")
    parser.add_argument("output_filename_scores",
                        type=str,
                        help="Path where to store the output .csv scores.")
    parser.add_argument("output_filename_matches",
                        type=str,
                        help="Path where to store the output .csv matches.")
    args = parser.parse_args()

    queries_spectra = list(load_from_msp(args.queries_filename))
    if args.symmetric:
        reference_spectra = []
    else:
        reference_spectra = list(load_from_msp(args.references_filename))

    if args.default_filters is True:
        print("Applying default filters...")
        queries_spectra = list(map(default_filters, queries_spectra))
        reference_spectra = list(map(default_filters, reference_spectra))

    if args.normalize_intensities is True:
        print("Normalizing intensities...")
        queries_spectra = list(map(normalize_intensities, queries_spectra))
        reference_spectra = list(map(normalize_intensities, reference_spectra))

    if args.similarity_metric == 'CosineGreedy':
        similarity_metric = CosineGreedy(args.tolerance, args.mz_power,
                                         args.intensity_power)
    elif args.similarity_metric == 'CosineHungarian':
        similarity_metric = CosineHungarian(args.tolerance, args.mz_power,
                                            args.intensity_power)
    elif args.similarity_metric == 'ModifiedCosine':
        similarity_metric = ModifiedCosine(args.tolerance, args.mz_power,
                                           args.intensity_power)
        reference_spectra = list(map(add_precursor_mz, reference_spectra))
        queries_spectra = list(map(add_precursor_mz, queries_spectra))
    else:
        return -1

    print("Calculating scores...")
    scores = calculate_scores(
        references=queries_spectra if args.symmetric else reference_spectra,
        queries=queries_spectra,
        similarity_function=similarity_metric,
        is_symmetric=args.symmetric)

    write_outputs(args, scores)
    return 0
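The write_outputs helper is not included in this snippet. A hypothetical sketch mirroring how the earlier main() writes its score and match tables (the ';' separator is an assumption carried over from that example):

from pandas import DataFrame


def write_outputs(args, scores):
    """Hypothetical helper: dump score and match matrices to the requested CSV files."""
    query_names = [s.metadata['name'] for s in scores.queries]
    reference_names = [s.metadata['name'] for s in scores.references]

    DataFrame(data=[row["score"] for row in scores.scores],
              index=reference_names,
              columns=query_names).to_csv(args.output_filename_scores, sep=';')

    DataFrame(data=[row["matches"] for row in scores.scores],
              index=reference_names,
              columns=query_names).to_csv(args.output_filename_matches, sep=';')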
def test_user_workflow():
    def apply_my_filters(s):
        s = default_filters(s)
        s = add_parent_mass(s)
        s = normalize_intensities(s)
        s = select_by_relative_intensity(s,
                                         intensity_from=0.0,
                                         intensity_to=1.0)
        s = select_by_mz(s, mz_from=0, mz_to=1000)
        s = require_minimum_number_of_peaks(s, n_required=5)
        return s

    module_root = os.path.join(os.path.dirname(__file__), '..')
    spectrums_file = os.path.join(module_root, 'tests', 'pesticides.mgf')

    # apply my filters to the data
    spectrums = [apply_my_filters(s) for s in load_from_mgf(spectrums_file)]

    # omit spectrums that didn't qualify for analysis
    spectrums = [s for s in spectrums if s is not None]

    # this will be a library grouping analysis, so queries = references = spectrums
    queries = spectrums[:]
    references = spectrums[:]

    # define similarity function
    cosine_greedy = CosineGreedy()

    # calculate_scores
    scores = list(calculate_scores(references, queries, cosine_greedy))

    # filter out self-comparisons, require at least 20 matching peaks:
    filtered = [(reference, query, score, n_matching)
                for (reference, query, score, n_matching) in scores
                if reference != query and n_matching >= 20]

    sorted_by_score = sorted(filtered, key=lambda elem: elem[2], reverse=True)

    actual_top10 = sorted_by_score[:10]

    expected_top10 = [
        (references[48], queries[50],
         pytest.approx(0.9994510368270997, rel=1e-9), 25),
        (references[50], queries[48],
         pytest.approx(0.9994510368270997, rel=1e-9), 25),
        (references[46], queries[48],
         pytest.approx(0.9981252309590571, rel=1e-9), 27),
        (references[48], queries[46],
         pytest.approx(0.9981252309590571, rel=1e-9), 27),
        (references[46], queries[50],
         pytest.approx(0.9979632203390496, rel=1e-9), 22),
        (references[50], queries[46],
         pytest.approx(0.9979632203390496, rel=1e-9), 22),
        (references[73], queries[74],
         pytest.approx(0.9956795920716246, rel=1e-9), 23),
        (references[74], queries[73],
         pytest.approx(0.9956795920716246, rel=1e-9), 23),
        (references[57], queries[59],
         pytest.approx(0.9886557001269415, rel=1e-9), 46),
        (references[59], queries[57],
         pytest.approx(0.9886557001269415, rel=1e-9), 46),
    ]
    assert actual_top10 == expected_top10
def main():
    parser = argparse.ArgumentParser(description='Creating Spec2Vec Pairs')
    parser.add_argument('input_mgf', help='input_mgf')
    parser.add_argument('output_pairs', help='output_pairs')
    parser.add_argument('model_file', help='model_file')
    parser.add_argument('--min_score',
                        type=float,
                        default=0.7,
                        help='model_file')
    args = parser.parse_args()

    spectra = load_from_mgf(args.input_mgf)

    filtered_spectra = [post_process(s) for s in spectra]

    # Omit spectrums that didn't qualify for analysis
    filtered_spectra = [s for s in filtered_spectra if s is not None]

    # Create spectrum documents
    query_documents = [
        SpectrumDocument(s, n_decimals=2) for s in filtered_spectra
    ]

    #DEBUG
    #query_documents = query_documents[:100]

    # Loading the model
    model = gensim.models.Word2Vec.load(args.model_file)

    # Define similarity_function
    spec2vec = Spec2Vec(model=model,
                        intensity_weighting_power=0.5,
                        allowed_missing_percentage=80.0)

    print("total documents", len(query_documents))
    scores = calculate_scores(query_documents, query_documents,
                              spec2vec).scores

    number_of_spectra = len(query_documents)

    output_scores_list = []
    for i in range(number_of_spectra):
        for j in range(number_of_spectra):
            if i <= j:
                continue

            i_spectrum = filtered_spectra[i]
            j_spectrum = filtered_spectra[j]

            sim = scores[i][j]

            if sim < args.min_score:
                continue

            score_dict = {}
            score_dict["filename"] = args.input_mgf
            score_dict["CLUSTERID1"] = i_spectrum.metadata["scans"]
            score_dict["CLUSTERID2"] = j_spectrum.metadata["scans"]
            score_dict["Cosine"] = sim
            score_dict["mz1"] = i_spectrum.metadata["pepmass"][0]
            score_dict["mz2"] = j_spectrum.metadata["pepmass"][0]
            score_dict["DeltaMZ"] = score_dict["mz2"] - score_dict["mz1"]
            score_dict["EdgeAnnotation"] = "Spec2Vec"

            output_scores_list.append(score_dict)

    # Saving Data Out
    pd.DataFrame(output_scores_list).to_csv(args.output_pairs,
                                            sep="\t",
                                            index=False)
Example #23
def main(argv):
    parser = argparse.ArgumentParser(
        description="Compute MSP similarity scores")
    parser.add_argument("-s",
                        dest="symmetric",
                        action='store_true',
                        help="Computation is symmetric.")
    parser.add_argument("--ref",
                        dest="references_filename",
                        type=str,
                        help="Path to reference spectra library.")
    parser.add_argument("--ref_format",
                        dest="references_format",
                        type=str,
                        help="Reference spectra library file format.")
    parser.add_argument("queries_filename",
                        type=str,
                        help="Path to query spectra.")
    parser.add_argument("queries_format",
                        type=str,
                        help="Query spectra file format.")
    parser.add_argument("similarity_metric",
                        type=str,
                        help='Metric to use for matching.')
    parser.add_argument("tolerance",
                        type=float,
                        help="Tolerance to use for peak matching.")
    parser.add_argument(
        "mz_power",
        type=float,
        help="The power to raise mz to in the cosine function.")
    parser.add_argument(
        "intensity_power",
        type=float,
        help="The power to raise intensity to in the cosine function.")
    parser.add_argument("output_filename_scores",
                        type=str,
                        help="Path where to store the output .tsv scores.")
    parser.add_argument("output_filename_matches",
                        type=str,
                        help="Path where to store the output .tsv matches.")
    args = parser.parse_args()

    if args.queries_format == 'msp':
        queries_spectra = list(load_from_msp(args.queries_filename))
    elif args.queries_format == 'mgf':
        queries_spectra = list(load_from_mgf(args.queries_filename))
    else:
        raise ValueError(
            f'File format {args.queries_format} not supported for query spectra.'
        )

    if args.symmetric:
        reference_spectra = []
    else:
        if args.references_format == 'msp':
            reference_spectra = list(load_from_msp(args.references_filename))
        elif args.references_format == 'mgf':
            reference_spectra = list(load_from_mgf(args.references_filename))
        else:
            raise ValueError(
                f'File format {args.references_format} not supported for reference spectra library.'
            )

    if args.similarity_metric == 'CosineGreedy':
        similarity_metric = CosineGreedy(args.tolerance, args.mz_power,
                                         args.intensity_power)
    elif args.similarity_metric == 'CosineHungarian':
        similarity_metric = CosineHungarian(args.tolerance, args.mz_power,
                                            args.intensity_power)
    elif args.similarity_metric == 'ModifiedCosine':
        similarity_metric = ModifiedCosine(args.tolerance, args.mz_power,
                                           args.intensity_power)
        reference_spectra = list(map(convert_precursor_mz, reference_spectra))
        queries_spectra = list(map(convert_precursor_mz, queries_spectra))
    else:
        return -1

    print("Calculating scores...")
    scores = calculate_scores(
        references=queries_spectra if args.symmetric else reference_spectra,
        queries=queries_spectra,
        similarity_function=similarity_metric,
        is_symmetric=args.symmetric)

    write_outputs(args, scores)
    return 0
Example #24
def test_user_workflow_spec2vec():
    """Test typical user workflow to get from mass spectra to spec2vec similarities.

    This test will run a typical workflow example using a small dataset and a
    pretrained word2vec model. One main aspect of this is to test if users will
    get exactly the same spec2vec similarity scores when starting from a word2vec
    model that was trained and saved elsewhere.
    """
    def apply_my_filters(s):
        """This is how a user would typically design his own pre- and post-
        processing pipeline."""
        s = default_filters(s)
        s = add_parent_mass(s)
        s = normalize_intensities(s)
        s = reduce_to_number_of_peaks(s, n_required=10, ratio_desired=0.5)
        s = select_by_mz(s, mz_from=0, mz_to=1000)
        s = add_losses(s, loss_mz_from=10.0, loss_mz_to=200.0)
        s = require_minimum_number_of_peaks(s, n_required=5)
        return s

    repository_root = os.path.join(os.path.dirname(__file__), "..")
    spectrums_file = os.path.join(repository_root, "tests", "pesticides.mgf")

    # apply my filters to the data
    spectrums = [apply_my_filters(s) for s in load_from_mgf(spectrums_file)]

    # omit spectrums that didn't qualify for analysis
    spectrums = [s for s in spectrums if s is not None]

    documents = [SpectrumDocument(s) for s in spectrums]

    model_file = os.path.join(repository_root, "integration-tests",
                              "test_user_workflow_spec2vec.model")
    if os.path.isfile(model_file):
        model = gensim.models.Word2Vec.load(model_file)
    else:
        # create and train model
        model = gensim.models.Word2Vec([d.words for d in documents],
                                       size=5,
                                       min_count=1)
        model.train([d.words for d in documents],
                    total_examples=len(documents),
                    epochs=20)
        model.save(model_file)

    # define similarity_function
    spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5)

    references = documents[:26]
    queries = documents[25:]

    # calculate scores on all combinations of references and queries
    scores = list(calculate_scores(references, queries, spec2vec))

    # filter out self-comparisons
    filtered = [(reference, query, score)
                for (reference, query, score) in scores if reference != query]

    sorted_by_score = sorted(filtered, key=lambda elem: elem[2], reverse=True)

    actual_top10 = sorted_by_score[:10]

    expected_top10 = [(documents[19], documents[25],
                       pytest.approx(0.9999121928249473, rel=1e-9)),
                      (documents[20], documents[25],
                       pytest.approx(0.9998846890269892, rel=1e-9)),
                      (documents[20], documents[45],
                       pytest.approx(0.9998756073673759, rel=1e-9)),
                      (documents[25], documents[45],
                       pytest.approx(0.9998750427994474, rel=1e-9)),
                      (documents[19], documents[27],
                       pytest.approx(0.9998722768460854, rel=1e-9)),
                      (documents[22], documents[27],
                       pytest.approx(0.9998633023352553, rel=1e-9)),
                      (documents[18], documents[27],
                       pytest.approx(0.9998616961532616, rel=1e-9)),
                      (documents[19], documents[45],
                       pytest.approx(0.9998528723697396, rel=1e-9)),
                      (documents[14], documents[71],
                       pytest.approx(0.9998404364805897, rel=1e-9)),
                      (documents[20], documents[27],
                       pytest.approx(0.9998336807761137, rel=1e-9))]

    assert actual_top10 == expected_top10, "Expected different top 10 table."