def test_save_load_mgf_consistency(tmpdir, charge, ionmode, parent_mass):
    """Round-trip two spectra through an .mgf file and check metadata survives."""
    peaks_mz = numpy.array([100.1, 200.02, 300.003], dtype="float")
    peaks_intensities = numpy.array([0.01, 0.02, 1.0], dtype="float")
    metadata = {
        "precursor_mz": 200.5,
        "charge": charge,
        "ionmode": ionmode,
        "parent_mass": parent_mass,
    }
    builder = SpectrumBuilder().with_mz(peaks_mz).with_intensities(peaks_intensities)
    spectra = [
        builder.with_metadata(metadata, metadata_harmonization=True).build()
        for _ in range(2)
    ]

    # Write both spectra to a temporary .mgf file and confirm it was created.
    filename = os.path.join(tmpdir, "test.mgf")
    save_as_mgf(spectra, filename)
    assert os.path.isfile(filename)

    # Re-import and verify the harmonized metadata round-tripped.
    reloaded = list(load_from_mgf(filename))
    first = reloaded[0]
    assert first.get("precursor_mz") == 200.5
    assert first.get("charge") == charge
    assert first.get("ionmode") == ionmode
    # parent_mass comes back as text from the mgf file, hence the str() comparison.
    assert first.get("parent_mass") == str(parent_mass)
def predict_similarity_of_pair(
    self,
    mgf_path: str,
) -> float:
    """
    Predicts molecular structural similarities (Tanimoto scores) from pairs
    of mass spectrometry spectra in local mgf files using MS2DeepScore
    algorithm

    Parameters
    ----------
    mgf_path: str
        Local path to mgf file with two spectra

    Returns
    -------
    Molecular structural similarities (Tanimoto scores)
    """
    # Load the spectra lazily, send one request, and format the server reply.
    spectra = load_from_mgf(mgf_path)
    response = self._send_request(self._build_payload(spectra))
    return self._format_results(response)
def convert_files_to_matchms_spectrum_objects(
        file_name) -> Union[List[Spectrum], None]:
    """Loads spectra from your spectrum file into memory as matchms Spectrum objects

    The following file extensions can be loaded in with this function:
    "mzML", "json", "mgf", "msp", "mzxml", "usi" and "pickle".
    A pickled file is expected to directly contain a list of matchms spectrum
    objects.

    Args:
    -----
    file_name:
        Path to file containing spectra, with file extension "mzML", "json",
        "mgf", "msp", "mzxml", "usi" or "pickle"

    Returns None (after printing a warning) when the extension is unknown.
    """
    # NOTE(review): assert is stripped under `python -O`; kept as assert to
    # preserve the AssertionError type callers may already rely on.
    assert os.path.exists(
        file_name), f"The specified file: {file_name} does not exist"
    file_extension = os.path.splitext(file_name)[1].lower()
    # Dispatch table replaces the long if-chain; each loader yields spectra
    # lazily and is materialized into a list here.
    loaders = {
        ".mzml": importing.load_from_mzml,
        ".json": importing.load_from_json,
        ".mgf": importing.load_from_mgf,
        ".msp": importing.load_from_msp,
        ".mzxml": importing.load_from_mzxml,
        ".usi": importing.load_from_usi,
    }
    if file_extension in loaders:
        return list(loaders[file_extension](file_name))
    if file_extension == ".pickle":
        # Pickled files already contain a list of Spectrum objects.
        return load_pickled_file(file_name)
    print(f"File extension of file: {file_name} is not recognized")
    return None
def library_match(spectra_list, lib_mgf, precursor_tol=1.0, cosine=0.7, n_peaks=3):
    """Reads a given library mgf file and matches the given spectra to the
    library spectra using normal cosine. Each test spectra is given the name
    of the library spectra match with the highest cosine score."""
    # Scale every library spectrum's peak intensities to a maximum of 1.
    library_spectra = [
        normalize_intensities(spectrum) for spectrum in load_from_mgf(lib_mgf)
    ]

    scores = calculate_scores(references=library_spectra,
                              queries=spectra_list,
                              similarity_function=CosineHungarian())

    scores_list = []
    for entry in scores:
        print(entry)
        scores_list.append(entry)
    # Rank best-first by the cosine score stored at index 2 of each entry.
    # NOTE(review): the sorted list is built but never returned; callers
    # currently receive None — confirm whether a `return scores_list` is missing.
    scores_list.sort(reverse=True, key=lambda entry: entry[2])
def match_spectra_from_path(
    self,
    mgf_path: str,
    n_best: int,
    include_metadata: List[str] = None,
    ion_mode: str = "positive",
) -> List[pd.DataFrame]:
    """
    Finds the N best matches for spectra in a local mgf file using spec2vec
    algorithm.

    Parameters
    ----------
    mgf_path: str
        Local path to mgf file
    n_best: int
        Number of best matches to select
    include_metadata: List[str]
        Metadata keys to include in the response. Will make response slower.
        Please check the documentation for a list of valid keys.
    ion_mode: str
        Selects which model will be used for the predictions: Either a model
        trained with positive or negative ion mode spectra data. Defaults to
        positive.

    Returns
    -------
    A list of pandas dataframes containing the best matches and optionally
    metadata for these matches.

    Raises
    ------
    ValueError
        If ion_mode is neither "positive" nor "negative".
    """
    # validates input
    if ion_mode not in ["positive", "negative"]:
        # Fixed: the previous message had stray/mismatched quotes
        # ("'negative. Defaults to 'positive'.'").
        raise ValueError(
            "Parameter ion_mode should be either set to 'positive' or "
            "'negative'. Defaults to 'positive'."
        )

    parameters = self._build_parameters(n_best, include_metadata)

    # loads spectra lazily from the mgf file
    spectra_generator = load_from_mgf(mgf_path)

    # defines endpoint based on user choice of spectra ion mode
    endpoint = self._PREDICT_ENDPOINT_BASE.format(ion_mode=ion_mode)

    # issue requests respecting the spectra limit per request
    batch = []
    requests = []
    for spectrum in spectra_generator:
        batch.append(spectrum)
        if len(batch) == SPECTRA_LIMIT_PER_REQUEST:
            payload = self._build_payload(batch, parameters)
            requests.append(self._send_request(payload, endpoint))
            batch = []
    # flush the final, possibly partial, batch
    if batch:
        payload = self._build_payload(batch, parameters)
        requests.append(self._send_request(payload, endpoint))

    predictions = []
    for r in requests:
        predictions.extend(self._format_results(r))
    return predictions
def test_load_from_mgf_using_filepath():
    """Load the bundled pesticides.mgf by file path and expect Spectrum objects."""
    tests_dir = os.path.join(os.path.dirname(__file__), "..", "tests")
    spectra = list(load_from_mgf(os.path.join(tests_dir, "pesticides.mgf")))

    assert len(spectra) > 0
    assert isinstance(spectra[0], Spectrum)
def test_load_from_mgf_using_file():
    """Load pesticides.mgf from an already-open text file handle."""
    tests_dir = os.path.join(os.path.dirname(__file__), "..", "tests")
    filepath = os.path.join(tests_dir, "pesticides.mgf")
    with open(filepath, "r", encoding="utf-8") as handle:
        spectra = list(load_from_mgf(handle))

    assert len(spectra) > 0
    assert isinstance(spectra[0], Spectrum)
def test_user_workflow():
    """End-to-end workflow: filter spectra, score all pairs with CosineGreedy,
    and compare the top-10 table against known-good values."""
    def clean(spec):
        # Typical user-defined pre-processing pipeline.
        spec = default_filters(spec)
        spec = add_parent_mass(spec)
        spec = normalize_intensities(spec)
        spec = select_by_relative_intensity(spec, intensity_from=0.0,
                                            intensity_to=1.0)
        spec = select_by_mz(spec, mz_from=0, mz_to=1000)
        spec = require_minimum_number_of_peaks(spec, n_required=5)
        return spec

    module_root = os.path.join(os.path.dirname(__file__), "..")
    spectrums_file = os.path.join(module_root, "tests", "pesticides.mgf")

    # Clean the raw spectra; drop any that a filter rejected (None).
    spectrums = [clean(s) for s in load_from_mgf(spectrums_file)]
    spectrums = [s for s in spectrums if s is not None]

    # Library grouping analysis: the collection is compared against itself.
    queries = list(spectrums)
    references = list(spectrums)

    cosine_greedy = CosineGreedy(tolerance=0.3)
    scores = list(calculate_scores(references, queries, cosine_greedy))

    # Drop self-comparisons; require at least 20 matching peaks.
    filtered = [entry for entry in scores
                if entry[0] != entry[1] and entry[3] >= 20]
    actual_top10 = sorted(filtered, key=lambda entry: entry[2],
                          reverse=True)[:10]

    expected_top10 = [
        (references[48], queries[50], pytest.approx(0.9994783627790965, rel=1e-9), 25),
        (references[50], queries[48], pytest.approx(0.9994783627790965, rel=1e-9), 25),
        (references[46], queries[48], pytest.approx(0.9990141860269471, rel=1e-9), 27),
        (references[48], queries[46], pytest.approx(0.9990141860269471, rel=1e-9), 27),
        (references[46], queries[50], pytest.approx(0.9988793406908719, rel=1e-9), 22),
        (references[50], queries[46], pytest.approx(0.9988793406908719, rel=1e-9), 22),
        (references[57], queries[59], pytest.approx(0.9982171275552505, rel=1e-9), 46),
        (references[59], queries[57], pytest.approx(0.9982171275552505, rel=1e-9), 46),
        (references[73], queries[74], pytest.approx(0.9973823244169199, rel=1e-9), 23),
        (references[74], queries[73], pytest.approx(0.9973823244169199, rel=1e-9), 23),
    ]
    assert actual_top10 == expected_top10
def test_user_workflow_spec2vec():
    """Train a tiny word2vec model on filtered spectra and check the best
    spec2vec similarity exceeds 0.99."""
    def clean(spec):
        spec = default_filters(spec)
        spec = add_parent_mass(spec)
        spec = add_losses(spec)
        spec = normalize_intensities(spec)
        spec = select_by_relative_intensity(spec, intensity_from=0.01,
                                            intensity_to=1.0)
        spec = select_by_mz(spec, mz_from=0, mz_to=1000)
        spec = require_minimum_number_of_peaks(spec, n_required=5)
        return spec

    module_root = os.path.join(os.path.dirname(__file__), '..')
    spectrums_file = os.path.join(module_root, 'tests', 'pesticides.mgf')

    # Filter spectra and drop those that did not survive the pipeline.
    spectrums = [s for s in (clean(raw) for raw in load_from_mgf(spectrums_file))
                 if s is not None]
    documents = [SpectrumDocument(s) for s in spectrums]

    # Train a small model on the documents (gensim < 4 API: `size`, not
    # `vector_size`).
    corpus = [d.words for d in documents]
    model = gensim.models.Word2Vec(corpus, size=5, min_count=1)
    model.train(corpus, total_examples=len(documents), epochs=20)

    spec2vec = Spec2Vec(model=model, documents=documents)
    references = documents[:26]
    queries = documents[25:]

    # Score all reference/query combinations, drop self-comparisons.
    scores = list(calculate_scores(references, queries, spec2vec))
    filtered = [entry for entry in scores if entry[0] != entry[1]]
    top10 = sorted(filtered, key=lambda entry: entry[2], reverse=True)[:10]

    assert max(score for (_, _, score) in top10) > 0.99
def main(argv):
    """CLI entry point: score query spectra against a reference library
    (or against themselves with -s) and write .tsv outputs."""
    parser = argparse.ArgumentParser(
        description="Compute MSP similarity scores")
    parser.add_argument("-s", dest="symmetric", action='store_true',
                        help="Computation is symmetric.")
    parser.add_argument("--ref", dest="references_filename", type=str,
                        help="Path to reference spectra library.")
    parser.add_argument("--ref_format", dest="references_format", type=str,
                        help="Reference spectra library file format.")
    parser.add_argument("queries_filename", type=str,
                        help="Path to query spectra.")
    parser.add_argument("queries_format", type=str,
                        help="Query spectra file format.")
    parser.add_argument("similarity_metric", type=str,
                        help='Metric to use for matching.')
    parser.add_argument("tolerance", type=float,
                        help="Tolerance to use for peak matching.")
    parser.add_argument(
        "mz_power", type=float,
        help="The power to raise mz to in the cosine function.")
    parser.add_argument(
        "intensity_power", type=float,
        help="The power to raise intensity to in the cosine function.")
    parser.add_argument("output_filename_scores", type=str,
                        help="Path where to store the output .tsv scores.")
    parser.add_argument("output_filename_matches", type=str,
                        help="Path where to store the output .tsv matches.")
    args = parser.parse_args()

    # File-format dispatch shared by queries and references.
    loaders = {'msp': load_from_msp, 'mgf': load_from_mgf}

    if args.queries_format not in loaders:
        raise ValueError(
            f'File format {args.queries_format} not supported for query spectra.'
        )
    queries_spectra = list(loaders[args.queries_format](args.queries_filename))

    if args.symmetric:
        # Symmetric mode compares queries against themselves; no library needed.
        reference_spectra = []
    else:
        if args.references_format not in loaders:
            raise ValueError(
                f'File format {args.references_format} not supported for reference spectra library.'
            )
        reference_spectra = list(
            loaders[args.references_format](args.references_filename))

    if args.similarity_metric == 'CosineGreedy':
        similarity_metric = CosineGreedy(args.tolerance, args.mz_power,
                                         args.intensity_power)
    elif args.similarity_metric == 'CosineHungarian':
        similarity_metric = CosineHungarian(args.tolerance, args.mz_power,
                                            args.intensity_power)
    elif args.similarity_metric == 'ModifiedCosine':
        similarity_metric = ModifiedCosine(args.tolerance, args.mz_power,
                                           args.intensity_power)
        # ModifiedCosine requires a precursor m/z on every spectrum.
        reference_spectra = list(map(convert_precursor_mz, reference_spectra))
        queries_spectra = list(map(convert_precursor_mz, queries_spectra))
    else:
        # Unknown metric: signal failure to the caller.
        return -1

    print("Calculating scores...")
    scores = calculate_scores(
        references=queries_spectra if args.symmetric else reference_spectra,
        queries=queries_spectra,
        similarity_function=similarity_metric,
        is_symmetric=args.symmetric)

    write_outputs(args, scores)
    return 0
def main(argv):
    """CLI entry point: apply selected filters to spectra from an msp/mgf
    file and write the filtered spectra back out.

    Bug fix: the mgf branch previously tested ``args.queries_format`` — an
    attribute this parser never defines — so loading any non-msp file crashed
    with AttributeError. It now tests ``args.spectra_format``.
    """
    parser = argparse.ArgumentParser(
        description="Compute MSP similarity scores")
    parser.add_argument("--spectra", type=str, required=True,
                        help="Mass spectra file to be filtered.")
    parser.add_argument("--spectra_format", type=str, required=True,
                        help="Format of spectra file.")
    parser.add_argument("--output", type=str, required=True,
                        help="Filtered mass spectra file.")
    parser.add_argument(
        "-normalise_intensities", action='store_true',
        help="Normalize intensities of peaks (and losses) to unit height.")
    parser.add_argument(
        "-default_filters", action='store_true',
        help=
        "Collection of filters that are considered default and that do no require any (factory) arguments."
    )
    parser.add_argument(
        "-clean_metadata", action='store_true',
        help=
        "Apply all adding and cleaning filters if possible, so that the spectra have canonical metadata."
    )
    parser.add_argument(
        "-relative_intensity", action='store_true',
        help=
        "Keep only peaks within set relative intensity range (keep if to_intensity >= intensity >= from_intensity)."
    )
    parser.add_argument("--from_intensity", type=float,
                        help="Lower bound for intensity filter")
    parser.add_argument("--to_intensity", type=float,
                        help="Upper bound for intensity filter")
    parser.add_argument(
        "-mz_range", action='store_true',
        help=
        "Keep only peaks between set m/z range (keep if to_mz >= m/z >= from_mz)."
    )
    parser.add_argument("--from_mz", type=float,
                        help="Lower bound for m/z filter")
    parser.add_argument("--to_mz", type=float,
                        help="Upper bound for m/z filter")
    args = parser.parse_args()

    # At least one filter flag must be set, otherwise there is nothing to do.
    if not (args.normalise_intensities or args.default_filters
            or args.clean_metadata or args.relative_intensity
            or args.mz_range):
        raise ValueError('No filter selected.')

    if args.spectra_format == 'msp':
        spectra = list(load_from_msp(args.spectra))
    elif args.spectra_format == 'mgf':  # was: args.queries_format (AttributeError)
        spectra = list(load_from_mgf(args.spectra))
    else:
        raise ValueError(
            f'File format {args.spectra_format} not supported for mass spectra file.'
        )

    filtered_spectra = []
    for spectrum in spectra:
        if args.normalise_intensities:
            spectrum = normalize_intensities(spectrum)
        if args.default_filters:
            spectrum = default_filters(spectrum)
        if args.clean_metadata:
            # Filters that add or canonicalize metadata fields.
            filters = [
                add_compound_name, add_precursor_mz, add_fingerprint,
                add_losses, add_parent_mass, add_retention_index,
                add_retention_time, clean_compound_name
            ]
            for metadata_filter in filters:
                spectrum = metadata_filter(spectrum)
        if args.relative_intensity:
            spectrum = select_by_relative_intensity(spectrum,
                                                    args.from_intensity,
                                                    args.to_intensity)
        if args.mz_range:
            spectrum = select_by_mz(spectrum, args.from_mz, args.to_mz)
        filtered_spectra.append(spectrum)

    # Write in the same format the spectra were read in.
    if args.spectra_format == 'msp':
        save_as_msp(filtered_spectra, args.output)
    else:
        save_as_mgf(filtered_spectra, args.output)
    return 0
def mgf_generator(mgf_path, limit=20):
    """Return up to *limit* spectra loaded from the mgf file at *mgf_path*.

    Generalization: the previous hard-coded cap of 20 is now a parameter
    (default 20 keeps the original behavior). islice stops consuming the
    loader's generator after *limit* spectra instead of materializing the
    whole file and then slicing.

    Parameters
    ----------
    mgf_path:
        Path to an .mgf file readable by load_from_mgf.
    limit: int
        Maximum number of spectra to return (default 20).
    """
    from itertools import islice  # local import; file's import block is out of view
    return list(islice(load_from_mgf(mgf_path), limit))
def test_user_workflow():
    """End-to-end workflow with default CosineGreedy: filter spectra, score all
    pairs, and compare the top-10 table against known-good values."""
    def clean(spec):
        # Typical user-defined pre-processing pipeline.
        spec = default_filters(spec)
        spec = add_parent_mass(spec)
        spec = normalize_intensities(spec)
        spec = select_by_relative_intensity(spec, intensity_from=0.0,
                                            intensity_to=1.0)
        spec = select_by_mz(spec, mz_from=0, mz_to=1000)
        spec = require_minimum_number_of_peaks(spec, n_required=5)
        return spec

    module_root = os.path.join(os.path.dirname(__file__), '..')
    spectrums_file = os.path.join(module_root, 'tests', 'pesticides.mgf')

    # Clean the raw spectra; drop any that a filter rejected (None).
    spectrums = [clean(s) for s in load_from_mgf(spectrums_file)]
    spectrums = [s for s in spectrums if s is not None]

    # Library grouping analysis: the collection is compared against itself.
    queries = list(spectrums)
    references = list(spectrums)

    cosine_greedy = CosineGreedy()
    scores = list(calculate_scores(references, queries, cosine_greedy))

    # Drop self-comparisons; require at least 20 matching peaks.
    filtered = [entry for entry in scores
                if entry[0] != entry[1] and entry[3] >= 20]
    actual_top10 = sorted(filtered, key=lambda entry: entry[2],
                          reverse=True)[:10]

    expected_top10 = [
        (references[48], queries[50], pytest.approx(0.9994510368270997, rel=1e-9), 25),
        (references[50], queries[48], pytest.approx(0.9994510368270997, rel=1e-9), 25),
        (references[46], queries[48], pytest.approx(0.9981252309590571, rel=1e-9), 27),
        (references[48], queries[46], pytest.approx(0.9981252309590571, rel=1e-9), 27),
        (references[46], queries[50], pytest.approx(0.9979632203390496, rel=1e-9), 22),
        (references[50], queries[46], pytest.approx(0.9979632203390496, rel=1e-9), 22),
        (references[73], queries[74], pytest.approx(0.9956795920716246, rel=1e-9), 23),
        (references[74], queries[73], pytest.approx(0.9956795920716246, rel=1e-9), 23),
        (references[57], queries[59], pytest.approx(0.9886557001269415, rel=1e-9), 46),
        (references[59], queries[57], pytest.approx(0.9886557001269415, rel=1e-9), 46),
    ]
    assert actual_top10 == expected_top10
def main():
    """CLI entry point: compute pairwise Spec2Vec similarities for all spectra
    in an mgf file and write pairs above --min_score as a .tsv edge list."""
    parser = argparse.ArgumentParser(description='Creating Spec2Vec Pairs')
    parser.add_argument('input_mgf', help='input_mgf')
    parser.add_argument('output_pairs', help='output_pairs')
    parser.add_argument('model_file', help='model_file')
    parser.add_argument('--min_score', type=float, default=0.7,
                        help='model_file')
    args = parser.parse_args()

    # Post-process every spectrum; drop those that did not qualify (None).
    filtered_spectra = [
        s for s in (post_process(raw) for raw in load_from_mgf(args.input_mgf))
        if s is not None
    ]

    # Build spectrum documents with two-decimal peak binning.
    query_documents = [
        SpectrumDocument(s, n_decimals=2) for s in filtered_spectra
    ]

    # Load the pretrained word2vec model and set up the similarity function.
    model = gensim.models.Word2Vec.load(args.model_file)
    spec2vec = Spec2Vec(model=model,
                        intensity_weighting_power=0.5,
                        allowed_missing_percentage=80.0)

    print("total documents", len(query_documents))
    scores = calculate_scores(query_documents, query_documents,
                              spec2vec).scores

    number_of_spectra = len(query_documents)
    output_scores_list = []
    # Lower triangle only (j < i): each unordered pair once, no self-pairs.
    for i in range(number_of_spectra):
        for j in range(i):
            sim = scores[i][j]
            if sim < args.min_score:
                continue
            spec_i = filtered_spectra[i]
            spec_j = filtered_spectra[j]
            mz1 = spec_i.metadata["pepmass"][0]
            mz2 = spec_j.metadata["pepmass"][0]
            # Key order matters: it fixes the output column order.
            output_scores_list.append({
                "filename": args.input_mgf,
                "CLUSTERID1": spec_i.metadata["scans"],
                "CLUSTERID2": spec_j.metadata["scans"],
                "Cosine": sim,
                "mz1": mz1,
                "mz2": mz2,
                "DeltaMZ": mz2 - mz1,
                "EdgeAnnotation": "Spec2Vec",
            })

    # Saving Data Out
    pd.DataFrame(output_scores_list).to_csv(args.output_pairs,
                                            sep="\t",
                                            index=False)
def test_user_workflow_spec2vec_parallel():
    """Test typical user workflow to get from mass spectra to spec2vec similarities.

    Runs a typical workflow on a small dataset with a pretrained word2vec model.
    The main point is to verify that users get exactly the same spec2vec
    similarity scores when starting from a word2vec model that was trained
    and saved elsewhere.
    """
    def clean(spec):
        """A typical user-designed pre-/post-processing pipeline."""
        spec = default_filters(spec)
        spec = add_parent_mass(spec)
        spec = normalize_intensities(spec)
        spec = reduce_to_number_of_peaks(spec, n_required=10, ratio_desired=0.5)
        spec = select_by_mz(spec, mz_from=0, mz_to=1000)
        spec = add_losses(spec, loss_mz_from=10.0, loss_mz_to=200.0)
        spec = require_minimum_number_of_peaks(spec, n_required=5)
        return spec

    repository_root = os.path.join(os.path.dirname(__file__), "..")
    spectrums_file = os.path.join(repository_root, "tests", "pesticides.mgf")

    # Clean the raw spectra; drop any that a filter rejected (None).
    spectrums = [clean(s) for s in load_from_mgf(spectrums_file)]
    spectrums = [s for s in spectrums if s is not None]
    documents = [SpectrumDocument(s) for s in spectrums]

    model_file = os.path.join(repository_root, "integration-tests",
                              "test_user_workflow_spec2vec.model")
    if os.path.isfile(model_file):
        # Reuse the pretrained model so the scores are reproducible.
        model = gensim.models.Word2Vec.load(model_file)
    else:
        # Train and save a small model (gensim < 4 API: `size`).
        corpus = [d.words for d in documents]
        model = gensim.models.Word2Vec(corpus, size=10, min_count=1)
        model.train(corpus, total_examples=len(documents), epochs=20)
        model.save(model_file)

    spec2vec = Spec2VecParallel(model=model, intensity_weighting_power=0.5)
    references = documents[:26]
    queries = documents[25:]

    # Score all reference/query combinations, drop self-comparisons.
    scores = list(calculate_scores_parallel(references, queries, spec2vec))
    filtered = [entry for entry in scores if entry[0] != entry[1]]
    actual_top10 = sorted(filtered, key=lambda entry: entry[2],
                          reverse=True)[:10]

    expected_top10 = [
        (documents[19], documents[25], pytest.approx(0.9999121928249473, rel=1e-9)),
        (documents[20], documents[25], pytest.approx(0.9998846890269892, rel=1e-9)),
        (documents[20], documents[45], pytest.approx(0.9998756073673759, rel=1e-9)),
        (documents[25], documents[45], pytest.approx(0.9998750427994474, rel=1e-9)),
        (documents[19], documents[27], pytest.approx(0.9998722768460854, rel=1e-9)),
        (documents[22], documents[27], pytest.approx(0.9998633023352553, rel=1e-9)),
        (documents[18], documents[27], pytest.approx(0.9998616961532616, rel=1e-9)),
        (documents[19], documents[45], pytest.approx(0.9998528723697396, rel=1e-9)),
        (documents[14], documents[71], pytest.approx(0.9998404364805897, rel=1e-9)),
        (documents[20], documents[27], pytest.approx(0.9998336807761137, rel=1e-9)),
    ]
    assert actual_top10 == expected_top10, "Expected different top 10 table."
print(args) if (args.matchms): print('use matchms') import matchms from matchms.importing import load_from_mgf from matchms.filtering import default_filters from matchms.filtering import normalize_intensities from matchms.filtering import add_precursor_mz from matchms import calculate_scores from matchms.similarity import ModifiedCosine from msmolnet.use_matchms import convert_matches as convert input_mgf = f'{args.input}.mgf' print(f"reading file {input_mgf}") file = load_from_mgf(input_mgf) print(file) print("normalising intensities") # Apply filters to clean and enhance each spectrum spectrums = [] for spectrum in file: spectrum = default_filters(spectrum) # Scale peak intensities to maximum of 1 spectrum = normalize_intensities(spectrum) print(spectrum.get('precursor_mz')) spectrums.append(spectrum) scores = calculate_scores( references=spectrums,