def test_spectra_merging__EAX000402(self):
        """
        Merge the records EA0004[56][0-9].txt and compare the result with the reference spectrum EAX000402.

        The reference peak list is the merged spectrum used in [1]. The merging strategy was originally proposed
        in [2].

        References:
            [1] "MetFrag relaunched: incorporating strategies beyond in silico fragmentation" by Ruttkies et al. (2016)
            [2] "Alignment of high resolution mass spectra: development of a heuristic approach for metabolomics" by
                Kazmi et al. (2006)
        """
        # NOTE: Skipping this test was considered at some point, because the intensities in the original file
        #       (EAX000402) look weird. The test is currently active.

        # Collect the spectra to merge: EA0004[56][0-9].txt --> EAX000402.txt
        record_dir = os.path.join(os.path.dirname(__file__), "example_massbank_records")
        spectra = [MBSpectrum(mb_fn) for mb_fn in glob.iglob(os.path.join(record_dir, "EA0004[56][0-9].txt"))]

        # Merge the spectra (hierarchical clustering of the peaks)
        merged_spectrum = MBSpectrum.merge_spectra(spectra)  # type: MBSpectrum

        # Reference peaks (mz, intensity) of the merged spectrum as used by [1]
        peaks_ref = [
            (117.0347000, 999),
            (186.0677308, 999)
        ]
        mzs_ref, raw_ints_ref = zip(*peaks_ref)
        ints_ref = np.array(raw_ints_ref)

        np.testing.assert_almost_equal(mzs_ref, merged_spectrum.get_mz())
        # The reference intensities are on a 0-999 scale and are rescaled before the comparison
        np.testing.assert_almost_equal(ints_ref / 999, merged_spectrum.get_int())
    def test_spectra_merging__EAX281502(self):
        """
        Merge the records EA2815[56][0-9].txt and compare the result with the reference spectrum EAX281502.

        The reference peak list is the merged spectrum used in [1]. The merging strategy was originally proposed
        in [2].

        References:
            [1] "MetFrag relaunched: incorporating strategies beyond in silico fragmentation" by Ruttkies et al. (2016)
            [2] "Alignment of high resolution mass spectra: development of a heuristic approach for metabolomics" by
                Kazmi et al. (2006)
        """
        # Collect the spectra to merge: EA2815[56][0-9].txt --> EAX281502.txt
        record_dir = os.path.join(os.path.dirname(__file__), "example_massbank_records")
        spectra = [MBSpectrum(mb_fn) for mb_fn in glob.iglob(os.path.join(record_dir, "EA2815[56][0-9].txt"))]

        # Merge the spectra (hierarchical clustering of the peaks)
        merged_spectrum = MBSpectrum.merge_spectra(spectra)  # type: MBSpectrum

        # Reference peaks (mz, intensity) of the merged spectrum as used by [1]
        peaks_ref = [
            (81.0220667, 257.0000000),
            (96.0095000, 226.0000000),
            (109.0170500, 259.0000000),
            (111.0198667, 290.0000000),
            (116.0504625, 559.0000000),
            (118.0663111, 999.0000000),
            (126.1288286, 348.0000000),
            (156.0821000, 84.0000000),
            (170.1188000, 2.0000000),
            (172.0770500, 54.0000000),
            (174.0561667, 114.0000000),
            (182.1189250, 95.0000000),
            (197.1296571, 173.0000000),
            (200.0717900, 999.0000000),
            (227.1402000, 67.0000000),
            (230.1551000, 290.0000000),
            (244.0616500, 15.0000000),
            (257.2023400, 218.0000000),
            (273.1971500, 176.0000000),
            (276.0872500, 3.0000000),
            (283.1819500, 70.0000000),
            (285.1611800, 381.0000000),
            (301.1921875, 459.0000000),
            (317.1871200, 142.0000000),
            (327.1710000, 128.0000000),
            (331.2034500, 3.0000000),
            (345.1821167, 999.0000000),
            (359.1974333, 177.0000000),
            (377.2083750, 999.0000000)
        ]
        mzs_ref, raw_ints_ref = zip(*peaks_ref)
        ints_ref = np.array(raw_ints_ref)

        np.testing.assert_almost_equal(mzs_ref, merged_spectrum.get_mz())
        # The reference intensities are on a 0-999 scale and are rescaled before the comparison
        np.testing.assert_almost_equal(ints_ref / 999, merged_spectrum.get_int(), decimal=3)
 def test_to_sirius__custom_candidate_db(self):
     """
     Check that the SIRIUS export returns a non-empty candidate entry ("FIO00665.tsv") when a table of molecular
     candidates is passed along.
     """
     record_dir = os.path.join(os.path.dirname(__file__), "example_massbank_records")

     # Spectrum 1 -------------------
     spec = MBSpectrum(os.path.join(record_dir, "FIO00665.txt"))
     candidates = pd.read_csv(os.path.join(record_dir, "FIO00665.tsv"), sep="\t")
     out = spec.to_sirius_format(molecular_candidates=candidates)

     # Both output files must be present ...
     for out_fn in ("FIO00665.ms", "FIO00665.tsv"):
         self.assertIn(out_fn, out)

     # ... and the candidate file must be populated
     self.assertIsNotNone(out["FIO00665.tsv"])
    def test_metainformation_merging__FIO00665(self):
        """
        Check the meta-information of the spectrum that results from "merging" a single record (FIO00665).
        """
        # Apply merge function to a single spectrum
        mb_fn = os.path.join(os.path.dirname(__file__), "example_massbank_records", "FIO00665.txt")
        spec = MBSpectrum(mb_fn)
        acc_ref = [os.path.basename(mb_fn).split(".")[0]]

        # Reference meta-information taken from the un-merged spectrum
        meta_ref = {
            key: spec.get(key) for key in ("retention_time", "precursor_mz", "record_title", "collision_energy")
        }

        # -------------------
        # WITH RT AGGREGATION
        # -------------------
        merged_spectrum = MBSpectrum.merge_spectra([spec], rt_agg_fun=np.min)  # type: MBSpectrum

        self.assertEqual("FBZONXHGGPHHIY-UHFFFAOYSA-N", merged_spectrum.get("inchikey"))
        self.assertEqual(acc_ref, merged_spectrum.get("original_accessions"))

        # The new accession keeps the contributor prefix and is derived from the original accessions
        self.assertEqual("FIO", merged_spectrum.get("accession")[:3])
        self.assertEqual(
            MBSpectrum._get_new_accession_id(merged_spectrum.get("original_accessions")),
            merged_spectrum.get("accession")
        )

        for key in ("retention_time", "precursor_mz", "record_title"):
            self.assertEqual(meta_ref[key], merged_spectrum.get(key))

        # The collision energy is reported as a list, here with a single entry
        self.assertEqual([meta_ref["collision_energy"]], merged_spectrum.get("collision_energy"))
    def test_spectra_merging__EAX000401(self):
        """
        Merge the records EA0004[01][0-9].txt and compare the result with the reference spectrum EAX000401.

        The reference peak list is the merged spectrum used in [1]. The merging strategy was originally proposed
        in [2].

        References:
            [1] "MetFrag relaunched: incorporating strategies beyond in silico fragmentation" by Ruttkies et al. (2016)
            [2] "Alignment of high resolution mass spectra: development of a heuristic approach for metabolomics" by
                Kazmi et al. (2006)
        """
        # Collect the spectra to merge: EA0004[01][0-9].txt --> EAX000401.txt
        record_dir = os.path.join(os.path.dirname(__file__), "example_massbank_records")
        spectra = [MBSpectrum(mb_fn) for mb_fn in glob.iglob(os.path.join(record_dir, "EA0004[01][0-9].txt"))]

        # Merge the spectra (hierarchical clustering of the peaks)
        merged_spectrum = MBSpectrum.merge_spectra(spectra)  # type: MBSpectrum

        # Reference peaks (mz, intensity) of the merged spectrum as used by [1]
        peaks_ref = [
            (53.03852, 55),
            (57.0447285714286, 93),
            (65.0386, 116),
            (77.0386, 884),
            (81.03345, 12),
            (85.0396545454545, 17),
            (91.0542714285714, 123),
            (92.0494875, 377),
            (95.04925, 112),
            (103.041733333333, 15),
            (104.049541666667, 999),
            (105.044757142857, 271),
            (105.069975, 12),
            (110.060033333333, 7),
            (119.060475, 631),
            (130.04005, 11),
            (130.0652, 49),
            (131.07295, 24),
            (142.0652, 9),
            (147.0554, 3),
            (160.087069230769, 999),
            (188.082038461538, 999)
        ]
        mzs_ref, raw_ints_ref = zip(*peaks_ref)
        ints_ref = np.array(raw_ints_ref)

        np.testing.assert_almost_equal(mzs_ref, merged_spectrum.get_mz())
        # The reference intensities are on a 0-999 scale and are rescaled before the comparison
        np.testing.assert_almost_equal(ints_ref / 999, merged_spectrum.get_int(), decimal=3)
    def test_rt_sanitizer(self):
        """
        Check how (retention time, unit) string pairs are sanitized: the time is compared against a number and the
        unit against None, "sec" or "min".
        """
        # (raw "retention_time" entry, expected retention time, expected retention time unit)
        cases = [
            (("430", ""), 430, None),
            (("430.3", "min"), 430.3, "min"),
            (("32", "s"), 32, "sec"),
            (("32", "m"), 32, "min"),
        ]

        for raw_rt, rt_ref, unit_ref in cases:
            out = MBSpectrum._sanitize_meta_information({"retention_time": raw_rt})
            self.assertEqual(rt_ref, out["retention_time"])
            self.assertEqual(unit_ref, out["retention_time_unit"])
    def test_metainformation_merging__EAX000401(self):
        """
        Check the meta-information of the spectrum merged from the records EA0004[01][0-9].txt, with and without
        aggregation of the retention times.
        """
        # Load the list of spectra to merge: EA0004[01][0-9].txt --> EAX000401.txt
        pattern = os.path.join(os.path.dirname(__file__), "example_massbank_records", "EA0004[01][0-9].txt")
        meta_keys = ("retention_time", "precursor_mz", "record_title", "collision_energy")

        spectra = []
        acc_ref = []
        meta_ref = {key: [] for key in meta_keys}
        for mb_fn in glob.iglob(pattern):
            spec = MBSpectrum(mb_fn)
            spectra.append(spec)

            # collect some reference meta-information (in the same order as the spectra are loaded)
            acc_ref.append(os.path.basename(mb_fn).split(".")[0])
            for key in meta_keys:
                meta_ref[key].append(spec.get(key))

        # -------------------
        # WITH RT AGGREGATION
        # -------------------
        merged_spectrum = MBSpectrum.merge_spectra(spectra, rt_agg_fun=np.mean)  # type: MBSpectrum

        self.assertEqual("OUSYWCQYMPDAEO-UHFFFAOYSA-N", merged_spectrum.get("inchikey"))
        self.assertEqual(acc_ref, merged_spectrum.get("original_accessions"))

        # The new accession keeps the contributor prefix and is derived from the original accessions
        self.assertEqual("EA", merged_spectrum.get("accession")[:2])
        self.assertEqual(
            MBSpectrum._get_new_accession_id(merged_spectrum.get("original_accessions")),
            merged_spectrum.get("accession")
        )

        self.assertEqual("min", merged_spectrum.get("retention_time_unit"))
        self.assertEqual(np.mean(meta_ref["retention_time"]), merged_spectrum.get("retention_time"))
        # A single precursor mz is reported; it is compared with the one of the first spectrum
        self.assertEqual(meta_ref["precursor_mz"][0], merged_spectrum.get("precursor_mz"))
        self.assertEqual(meta_ref["record_title"], merged_spectrum.get("record_title"))
        self.assertEqual(meta_ref["collision_energy"], merged_spectrum.get("collision_energy"))

        # ----------------------
        # WITHOUT RT AGGREGATION
        # ----------------------
        merged_spectrum = MBSpectrum.merge_spectra(spectra, rt_agg_fun=None)  # type: MBSpectrum

        self.assertEqual("min", merged_spectrum.get("retention_time_unit"))
        # Without aggregation the individual retention times are kept as a list
        self.assertEqual(meta_ref["retention_time"], merged_spectrum.get("retention_time"))
    def test_peak_parsing(self):
        """
        Check that the (mz, intensity) peak lists parsed from two example records equal the expected values.
        """
        record_dir = os.path.join(os.path.dirname(__file__), "example_massbank_records")

        # (record file, expected peak list)
        cases = [
            # Spectrum 1
            ("EQ308406.txt", [
                (65.0388, 454422.7),
                (105.0703, 3594629.8),
                (115.0544, 916283.5),
                (121.0287, 482029.8),
                (139.0544, 904286),
                (163.0549, 764499.2),
                (164.0627, 2677649.4),
                (165.0706, 293933491.2),
                (166.0782, 235412982.7),
                (183.0812, 3256920.2),
                (193.0767, 1661789.6),
                (199.0315, 5538869.4),
                (201.0474, 7384944.1),
            ]),
            # Spectrum 2
            ("FIO00665.txt", [
                (116.050500, 2998.000000),
                (117.054700, 236.000000),
                (118.029100, 170.000000),
                (131.037600, 1.23552e13),
                (132.045500, 1241.000000),
                (133.048300, 116.000000),
                (159.032400, 1732.000000),
                (160.040200, 10392.000000),
                (161.043200, 1399.000000),
            ]),
        ]

        for record_fn, peaks in cases:
            spec = MBSpectrum(os.path.join(record_dir, record_fn))
            self.assertEqual(peaks, spec.get_peaks())
 def test_to_sirius__gt_molecular_formula(self):
     """
     Check the formula entry of the SIRIUS ms-file: by default the output contains "#formula" but not ">formula";
     with 'add_gt_molecular_formula=True' it is the other way around.
     """
     acc = "EQ308406"
     record_fn = os.path.join(os.path.dirname(__file__), "example_massbank_records", "%s.txt" % acc)

     def export_ms_file(**kwargs):
         # The record is loaded from scratch for every check
         return MBSpectrum(record_fn).to_sirius_format(**kwargs)[acc + ".ms"]

     # Default export
     self.assertIn("#formula", export_ms_file())
     self.assertNotIn(">formula", export_ms_file())

     # Export including the ground truth molecular formula
     self.assertNotIn("#formula", export_ms_file(add_gt_molecular_formula=True))
     self.assertIn(">formula", export_ms_file(add_gt_molecular_formula=True))
Beispiel #10
0
def _run_parser(pattern="/home/bach/Documents/doctoral/data/MassBank-data_bachi55/ISAS_Dortmund/IA*.txt"):
    """
    Try to parse all MassBank records matching a glob pattern and print the names of those that fail.

    The files are processed in sorted order. For every file whose parsing raises an AssertionError the base name of
    the file is printed. All other exceptions are propagated.

    :param pattern: string, glob pattern selecting the record files to parse. The default is the path that was
        previously hard-coded in this function, so calling it without arguments behaves as before.
    """
    import os

    from glob import glob

    from massbank2db.spectrum import MBSpectrum

    for msfn in sorted(glob(pattern)):
        # Read all meta information from the MS-file. Only whether parsing succeeds matters here, so the parsed
        # spectrum itself is discarded.
        try:
            MBSpectrum(msfn)
        except AssertionError:
            print(os.path.basename(msfn))
    def test_to_sirius(self):
        """
        Check the SIRIUS export of two single spectra and of a spectrum merged from several records.

        For the single spectra the presence of the output files, the instrument profile line and the fragmentation
        peaks listed after the ">ms2merged" line are verified. For the merged spectrum it is verified that the
        ms-file contains either one ">ms2merged" block (merged peak lists) or one ">ms2peaks" block per original
        spectrum (peak lists kept separate).
        """
        record_dir = os.path.join(os.path.dirname(__file__), "example_massbank_records")

        # Single spectra -------------------
        # (accession, expected instrument profile line)
        for acc, profile_line in [("FIO00665", ">profile qtof"), ("EQ308406", ">profile orbitrap")]:
            with self.subTest(accession=acc):
                spec = MBSpectrum(os.path.join(record_dir, acc + ".txt"))
                out = spec.to_sirius_format()

                ms_fn = acc + ".ms"
                tsv_fn = acc + ".tsv"
                self.assertIn(ms_fn, out)
                self.assertIn(tsv_fn, out)
                self.assertIsNone(out[tsv_fn])  # no molecular candidates were passed
                self.assertIn(profile_line, out[ms_fn])

                # check fragmentation peaks: they are expected line by line directly after ">ms2merged"
                ms_lines = out[ms_fn].split("\n")
                for idx, _peak in enumerate(spec.get_peaks(), start=ms_lines.index(">ms2merged") + 1):
                    _mz, _int = ms_lines[idx].split(" ")
                    self.assertEqual(_peak, (float(_mz), float(_int)))

        # Merged spectrum --------------------
        original_accessions = [
            'EA000412', 'EA000414', 'EA000401', 'EA000413', 'EA000408', 'EA000409', 'EA000405', 'EA000402', 'EA000406',
            'EA000404', 'EA000411', 'EA000403', 'EA000407', 'EA000410'
        ]
        spectra = [MBSpectrum(os.path.join(record_dir, "%s.txt" % oacc)) for oacc in original_accessions]

        # Name of the ms-file under which the merged spectrum is exported
        merged_ms_fn = "EA33002987.ms"

        self.assertIn(
            ">ms2merged",
            MBSpectrum.merge_spectra(spectra, merge_peak_lists=True).to_sirius_format()[merged_ms_fn]
        )

        # Without merging the peak lists, each original spectrum gets its own ">ms2peaks" block
        self.assertEqual(
            len(spectra),
            MBSpectrum.merge_spectra(spectra, merge_peak_lists=False).to_sirius_format()[merged_ms_fn].count(">ms2peaks")
        )
    def test_cfmid_output_parser(self):
        """
        Check the parsing of the CFM-ID output file "2931.txt", once with one spectrum per energy and once with
        all energies merged into a single spectrum.
        """
        cfmid_fn = os.path.join(os.path.dirname(__file__), "example_cfmid_outputs", "2931.txt")

        # Reference peaks for the example spectrum 2931.txt (one peak list per energy)
        peaks_ref = [
            [
                (243.07, 6.2),
                (271.06010, 100.0)
            ],
            [
                (161.02332, 48.5),
                (163.03897, 24.9),
                (215.07027, 17.3),
                (243.06519, 25.8),
                (253.04954, 15.0),
                (271.06010, 100.0)
            ],
            [
                (65.03858, 39.7),
                (75.02293, 17.6),
                (77.03858, 34.4),
                (93.03349, 20.4),
                (109.02841, 25.7),
                (111.04406, 49.2),
                (113.05971, 19.1),
                (121.02841, 46.3),
                (123.04406, 23.2),
                (131.01276, 34.6),
                (133.02841, 33.4),
                (135.04406, 22.4),
                (137.02332, 100.0),
                (161.02332, 65.0),
                (163.03897, 26.2),
                (177.01824, 25.9),
                (187.03897, 21.4),
                (189.05462, 23.3),
                (197.02332, 18.0),
                (201.05462, 25.9),
                (211.03897, 38.6),
                (213.05462, 82.4),
                (215.07027, 30.6),
                (225.05462, 71.4),
                (241.04954, 15.4),
                (243.06519, 17.8),
                (253.04954, 17.9)
            ]
        ]

        # --- Load the insilico spectrum (each energy separately)
        energy_spectra = MBSpectrum.from_cfmid_output(cfmid_fn, cfmid_4_format=True, merge_energies=False)

        # Accession and collision energy carry the index of the energy level
        for energy in range(3):
            self.assertEqual("ID2931%d" % energy, energy_spectra[energy].get("accession"))
            self.assertEqual("energy%d" % energy, energy_spectra[energy].get("collision_energy"))

        for energy in range(3):
            self.assertEqual(len(peaks_ref[energy]), len(energy_spectra[energy].get_peaks()))
            self.assertListEqual(peaks_ref[energy], energy_spectra[energy].get_peaks())

        # --- Load the insilico spectrum and merge the energies into a single spectrum
        merged = MBSpectrum.from_cfmid_output(cfmid_fn, cfmid_4_format=True, merge_energies=True)

        self.assertIsInstance(merged, MBSpectrum)
        self.assertListEqual(["ID2931%d" % energy for energy in range(3)], merged.get("original_accessions"))

        # The merged intensities are compared on a 0-1 scale (reference value divided by 100):
        #   1) peak that only appears in one energy
        #   2) peak that appears in multiple energies
        for mz, int_ref in ((65.03858, 39.7), (161.02332, 65.0)):
            self.assertIn((mz, int_ref / 100), merged.get_peaks())
    def test_to_metfrag(self):
        """
        Check the MetFrag export (peak list and configuration file) of two single spectra and of a merged spectrum.

        For every exported spectrum the configuration file must reference the peak list file, list the passed score
        weights and score types, and encode the ionization mode ("PrecursorIonMode" and "IsPositiveIonMode").
        """
        record_dir = os.path.join(os.path.dirname(__file__), "example_massbank_records")

        # MetFrag settings shared by all exports in this test
        common_settings = {
            "LocalDatabasePath": "/path/to/db",
            "ResultsPath": "/path/to/results",
            "NumberThreads": 4,
            "PeakListPath": "/path/to/peaks"
        }

        # Single spectra --------------------
        # (accession, score weights, score types, expected configuration lines)
        cases = [
            (
                "FIO00665",
                [0.8, 0.2],
                ["FragmenterScore", "PubChemNumberPatents"],
                [
                    "MetFragScoreWeights=0.8,0.2\n",
                    "MetFragScoreTypes=FragmenterScore,PubChemNumberPatents\n",
                    "PrecursorIonMode=-1\n",
                    "IsPositiveIonMode=False\n"
                ]
            ),
            (
                "EQ308406",
                [1.0],
                ["FragmenterScore"],
                [
                    "MetFragScoreWeights=1.0\n",
                    "MetFragScoreTypes=FragmenterScore\n",
                    "PrecursorIonMode=1\n",
                    "IsPositiveIonMode=True\n"
                ]
            )
        ]

        for acc, weights, score_types, config_lines in cases:
            with self.subTest(accession=acc):
                out = MBSpectrum(os.path.join(record_dir, acc + ".txt")).to_metfrag_format(
                    MetFragScoreWeights=weights, MetFragScoreTypes=score_types, **common_settings
                )

                peaks_fn = acc + ".peaks"
                config_fn = acc + ".conf"

                self.assertIn(peaks_fn, out)
                self.assertIn(config_fn, out)
                self.assertIn("PeakListPath=/path/to/peaks/" + peaks_fn, out[config_fn])
                for line in config_lines:
                    self.assertIn(line, out[config_fn])

        # Merged spectrum --------------------
        spectra = [MBSpectrum(mb_fn) for mb_fn in glob.iglob(os.path.join(record_dir, "EA0004[01][0-9].txt"))]

        merged_spectrum = MBSpectrum.merge_spectra(spectra)
        out = merged_spectrum.to_metfrag_format(
            MetFragScoreWeights=[1.0], MetFragScoreTypes=["FragmenterScore"], **common_settings
        )

        # The output files are named after the accession of the merged spectrum
        peaks_fn = merged_spectrum.get("accession") + ".peaks"
        config_fn = merged_spectrum.get("accession") + ".conf"

        self.assertIn("PeakListPath=/path/to/peaks/" + peaks_fn, out[config_fn])
        self.assertIn("MetFragScoreWeights=1.0\n", out[config_fn])
        self.assertIn("MetFragScoreTypes=FragmenterScore\n", out[config_fn])
        self.assertIn("PrecursorIonMode=1\n", out[config_fn])
        self.assertIn("IsPositiveIonMode=True\n", out[config_fn])