Ejemplo n.º 1
0
    def test_invalid_spectra(self):
        """Spectrum must raise ValueError for inputs it cannot parse."""
        # An empty string, an empty peak list, and a string whose pairs
        # are not written as "mz:intensity".
        invalid_inputs = ('', [], '1 10 2 5 3 5')

        for bad_input in invalid_inputs:
            with self.assertRaises(ValueError):
                Spectrum(bad_input, SpectrumType.MS)
Ejemplo n.º 2
0
def create_splash(input_file, output_file, separator, spectrum_type, spectrum_col, origin_col):
    """Compute a SPLASH for every row of a delimited file and write it out.

    Each input row is split on ``separator``; the spectrum string is taken
    from column ``spectrum_col`` (1-based). The output row is the SPLASH
    followed by all original columns, joined with ``separator``. Progress
    and timing information is printed to stderr.

    :param input_file: path of the delimited input file
    :param output_file: path of the output file, or None to write to stdout
    :param separator: column separator used for both input and output
    :param spectrum_type: spectrum type passed through to ``Spectrum``
    :param spectrum_col: 1-based index of the column holding the spectrum
    :param origin_col: 1-based index of the column holding the origin/id
    :raises IndexError: if a row has fewer columns than ``spectrum_col``
        or ``origin_col``
    """
    start_time = time.time()
    splasher = Splash()

    # Bound before the loop so the summary below also works for empty input.
    processed = 0

    with open(input_file, 'r') as f:
        # sys.stdout must not be used as a context manager here: leaving
        # the ``with`` block would close it for the rest of the process.
        fout = sys.stdout if output_file is None else open(output_file, 'w')

        try:
            for processed, line in enumerate(f, start=1):
                # Handle input
                fields = line.strip().split(separator)

                # The origin is echoed as part of the full row; indexing it
                # here keeps the early IndexError for rows missing the column.
                fields[origin_col - 1]
                spectrum_string = fields[spectrum_col - 1]

                spectrum = Spectrum(spectrum_string, spectrum_type)
                splash_code = splasher.splash(spectrum)

                # Print the spectrum id with the calculated splash id
                print(splash_code, *fields, sep=separator, file=fout)

                if processed % 10000 == 0:
                    print('processed %d spectra, %.2f ms average time to splash a spectrum' % (processed, 1000 * (time.time() - start_time) / processed), file=sys.stderr)
        finally:
            # Only close what this function opened.
            if fout is not sys.stdout:
                fout.close()

    print('finished processing, processing took: %.2f s' % (time.time() - start_time), file=sys.stderr)
    print('processed %d spectra' % processed, file=sys.stderr)

    # An average over zero spectra is undefined, so skip it for empty input.
    if processed:
        print('average time including io to splash a spectra is %.2f ms' % (1000 * (time.time() - start_time) / processed), file=sys.stderr)
Ejemplo n.º 3
0
    def test_spectrum_3(self):
        """A large nominal-mass spectrum hashes to its known SPLASH."""
        # The literal is written as two adjacent string literals, which
        # Python concatenates at compile time; a single-quoted string
        # cannot span physical lines.
        spectrum_string = (
            '85:1095.0 86:185.0 87:338.0 88:20.0 89:790.0 91:7688.0 92:1208.0 93:5630.0 94:838.0 95:7354.0 96:661.0 97:1634.0 99:410.0 101:712.0 102:182.0 103:1080.0 104:304.0 105:7310.0 106:1494.0 107:4463.0 108:768.0 109:2996.0 110:466.0 111:771.0 112:27.0 113:317.0 114:66.0 115:1485.0 116:673.0 117:2789.0 118:719.0 119:4687.0 120:2294.0 121:4237.0 122:489.0 123:1162.0 124:120.0 125:335.0 126:204.0 128:1042.0 129:15126.0 130:2142.0 131:3635.0 132:966.0 133:2697.0 134:681.0 135:1701.0 136:205.0 137:360.0 139:199.0 141:544.0 142:147.0 143:2163.0 144:394.0 145:3924.0 146:869.0 147:1624.0 148:615.0 149:1321.0 150:88.0 151:373.0 152:109.0 153:268.0 154:84.0 155:770.0 156:266.0 157:859.0 158:486.0 159:2227.0 160:1288.0 161:1723.0 162:387.0 163:949.0 164:178.0 165:359.0 166:52.0 167:62.0 168:118.0 169:352.0 170:79.0 171:675.0 172:238.0 173:1051.0 174:364.0 175:465.0 176:140.0 177:434.0 178:137.0 179:218.0 180:65.0 181:195.0 182:77.0 183:215.0 184:104.0 185:341.0 186:141.0 187:327.0 188:104.0 189:376.0 190:42.0 191:261.0 193:13.0 195:79.0 196:99.0 197:70.0 198:120.0 199:378.0 200:216.0 201:267.0 202:67.0 203:596.0 204:151.0 205:153.0 206:184.0 208:85.0 209:132.0 212:62.0 213:724.0 214:270.0 215:238.0 216:119.0 217:433.0 218:81.0 219:258.0 220:22.0 221:1.0 224:42.0 225:23.0 227:142.0 228:122.0 229:159.0 231:15.0 232:25.0 233:240.0 235:101.0 236:39.0 237:47.0 239:47.0 240:64.0 241:37.0 242:31.0 243:58.0 244:16.0 245:199.0 246:47.0 247:433.0 248:60.0 249:16.0 250:34.0 251:30.0 255:636.0 256:151.0 257:20.0 258:30.0 259:138.0 260:66.0 261:79.0 264:13.0 267:85.0 268:41.0 271:32.0 273:128.0 274:117.0 275:182.0 279:40.0 280:24.0 282:74.0 284:40.0 287:36.0 288:49.0 289:79.0 291:50.0 295:55.0 296:23.0 297:67.0 298:55.0 300:30.0 301:74.0 303:67.0 304:31.0 306:10.0 311:73.0 312:61.0 313:87.0 314:32.0 317:7.0 318:33.0 326:127.0 327:192.0 328:272.0 329:1729.0 330:473.0 331:136.0 339:47.0 340:21.0 342:30.0 349:32.0 351:29.0 353:641.0 354:300.0 355:14.0 368:1096.0 '
            '369:424.0 370:76.0 382:32.0 407:19.0 415:29.0 416:8.0 429:90.0 430:28.0 440:8.0 442:10.0 443:31.0 444:63.0 447:36.0 451:31.0 454:7.0 456:40.0 458:354.0 459:298.0 460:83.0 461:57.0 468:23.0 469:29.0 472:16.0 484:19.0 486:27.0'
        )
        splash_code = 'splash10-056v-2900000000-f47edee35669c8f014c2'

        self.assertEqual(
            Splash().splash(Spectrum(spectrum_string, SpectrumType.MS)),
            splash_code)
Ejemplo n.º 4
0
    def test_spectrum_1(self):
        """An accurate-mass spectrum string hashes to its known SPLASH."""
        peaks = '66.0463:2.1827 105.0698:7.9976 103.0541:4.5676 130.065:8.6025 93.0572:0.2544 79.0542:4.4657 91.0541:2.5671 131.0728:2.6844 115.0541:1.3542 65.0384:0.6554 94.0412:0.5614 116.0494:1.2008 95.049:2.1338 117.0572:100 89.0385:11.7808 77.0385:3.3802 90.0463:35.6373 132.0806:2.343 105.0446:1.771'
        expected = 'splash10-014i-4900000000-889a38f7ace2626a0435'

        spectrum = Spectrum(peaks, SpectrumType.MS)
        actual = Splash().splash(spectrum)

        self.assertEqual(actual, expected)
Ejemplo n.º 5
0
    def test_spectrum_2(self):
        """A second reference spectrum string hashes to its known SPLASH."""
        peaks = '303.07:100 662.26:1.2111 454.91:1.2023 433.25:0.8864 432.11:2.308 592.89:3.9052 259.99:0.6406 281.14:1.2549 451.34:1.1847 499.85:1.2374 482.14:2.4133 450:23.5191 483:1.0004 285.25:1.448 253.1:46.5731 254.11:3.247 259.13:6.9241 304.14:17.2795'
        expected = 'splash10-0udi-0049200000-ef488ecacceeaaadb4a2'

        spectrum = Spectrum(peaks, SpectrumType.MS)
        actual = Splash().splash(spectrum)

        self.assertEqual(actual, expected)
Ejemplo n.º 6
0
 def callback(msms: PySpectrum, file_name: str):
     """Write one semicolon-separated summary row for ``msms`` to ``out``.

     The row holds the MS level, the m/z and intensity of the highest
     peak, the first scan time, the SPLASH and the spectrum string.
     Nothing is written when ``msms`` is None. ``file_name`` is accepted
     but not used here.
     """
     if msms is None:
         return

     top_peak = msms.highest_peaks(1)[0]
     peak_text = msms.convert(msms).spectra
     splash_key = Splash().splash(Spectrum(peak_text, SpectrumType.MS))

     fields = (msms.ms_level, top_peak[0], top_peak[1],
               msms.scan_time[0], splash_key, peak_text)
     out.write("{};{};{};{};{};{}\n".format(*fields))
Ejemplo n.º 7
0
def get_splash(peaks):
    """Compute the SPLASH of a peak list, echoing the peaks and the result.

    :param peaks: iterable of indexable items whose first two elements are
        the m/z and the intensity of a peak
    :return: the SPLASH string produced by ``Splash().splash``
    """
    peak_data = []
    for peak in peaks:
        row = (peak[0], peak[1])  # mz, intensity
        # The call form with one argument prints the same text on
        # Python 2 and 3; the statement form is a SyntaxError on Python 3.
        print(row)
        peak_data.append(row)

    spectrum = Spectrum(peak_data, SpectrumType.MS)
    splash_key = Splash().splash(spectrum)
    print(splash_key)
    return splash_key
Ejemplo n.º 8
0
def get_SPLASH_from_pySPLASH(ion_list: List[Tuple[float, float]]) -> str:
    """Generate a SPLASH key using the pySPLASH installed in the local env.

    Args:
        ion_list: List of (mass, intensity) pairs.

    Returns:
        The SPLASH string computed for the given ions.
    """
    return Splash().splash(Spectrum(ion_list, SpectrumType.MS))
Ejemplo n.º 9
0
        def callback(msms: PySpectrum, file_name: str):
            """Store one spectrum of ``file_name`` inside a ``db.atomic()`` block.

            When ``msms`` is None, ``self.extract_record(file_name)`` is called
            instead. Otherwise the sample record for ``file_name`` is looked up
            and an ``MZMLMSMSSpectraRecord`` is created for the spectrum. An
            IndexError raised while building that record is ignored, so the
            spectrum is skipped.
            """
            with db.atomic():
                if msms is None:
                    self.extract_record(file_name)
                    return

                # Load the sample the spectrum belongs to.
                sample = MZMLSampleRecord.get(
                    MZMLSampleRecord.file_name == file_name)

                try:
                    base_peak = msms.highest_peaks(1)[0]
                    raw_spectra = msms.convert(msms, mode="raw").spectra

                    # Fall back to an empty mapping when no precursor was
                    # selected, so the lookups below default to 0.
                    precursor = {}
                    if len(msms.selected_precursors) > 0:
                        precursor = msms.selected_precursors[0]

                    scan_index = msms.index
                    splash_key = Splash().splash(
                        Spectrum(raw_spectra, SpectrumType.MS))

                    MZMLMSMSSpectraRecord.create(
                        sample=sample,
                        msms=raw_spectra,
                        rt=msms.scan_time[0],
                        splash=splash_key,
                        scan_number=scan_index,
                        level=msms.ms_level,
                        base_peak=base_peak[0],
                        base_peak_intensity=base_peak[1],
                        precursor=(precursor['mz']
                                   if 'mz' in precursor else 0),
                        precursor_intensity=(precursor['i']
                                             if 'i' in precursor else 0),
                        precursor_charge=(precursor['charge']
                                          if 'charge' in precursor else 0),
                        ion_count=len(msms.peaks("centroided")))
                except IndexError:
                    # not able to find highest peak
                    pass
Ejemplo n.º 10
0
    def test_spectrum_string(self):
        """A spectrum parsed from a string keeps all peaks, with max intensity 100."""
        parsed = Spectrum("1:10 2:5 3:5", SpectrumType.MS)

        peak_count = len(parsed.spectrum)
        top_intensity = max(peak[1] for peak in parsed.spectrum)

        self.assertEqual(peak_count, 3)
        self.assertEqual(top_intensity, 100.0)
Ejemplo n.º 11
0
    def test_spectrum_list(self):
        """A spectrum built from a list keeps all peaks, with max intensity 100."""
        parsed = Spectrum([(1, 10), (2, 5), (3, 5)], SpectrumType.MS)

        peak_count = len(parsed.spectrum)
        top_intensity = max(peak[1] for peak in parsed.spectrum)

        self.assertEqual(peak_count, 3)
        self.assertEqual(top_intensity, 100.0)
Ejemplo n.º 12
0
    def _encode(self,
                spec: Spectra,
                prefix: str = None,
                store_string: bool = False):
        """
        encodes the given spectra

        The spectra string is parsed into peaks, peaks are summed per m/z
        rounded to 5 digits, filtered to ``self.min_mz``..``self.max_mz`` and
        rendered through ``self._encode_dataframe``. The figure is always
        closed before returning so repeated calls do not accumulate open
        matplotlib figures.

        :param spec: spectra
        :param prefix: optional sub directory below ``self.directory``
        :param store_string: do you also want to store the spectra string for each spectra?
        :return: the RGB canvas bytes of the rendered figure when
            ``self.directory`` is None; None when the image was written to
            disk instead, or when a ValueError occurred while encoding
        """
        data = []

        # convert spectra to arrays, skipping pairs that are not "mz:intensity"
        for pair in spec.spectra.split(" "):
            try:
                mass, intensity = pair.split(":")

                frac, whole = math.modf(float(mass))

                data.append({
                    "intensity": float(intensity),
                    "mz": float(mass),
                    "nominal": int(whole),
                    "frac": round(float(frac), 4)
                })
            except ValueError:
                continue

        fig = None
        try:
            dataframe = pd.DataFrame(
                data, columns=["intensity", "mz", "nominal", "frac"])

            # group by 5 digits
            dataframe = dataframe.groupby(
                dataframe['mz'].apply(lambda x: round(x, 5))).sum()

            # drop data outside min and max
            dataframe = dataframe[(dataframe['nominal'] >= self.min_mz)
                                  & (dataframe['nominal'] <= self.max_mz)]

            # min-max scale the intensities
            dataframe['intensity_min_max'] = (
                dataframe['intensity'] - dataframe['intensity'].min()
            ) / (dataframe['intensity'].max() - dataframe['intensity'].min())

            # formatting
            # NOTE(review): matplotlib's figsize is (width, height) but the
            # height is passed first here - confirm this is intentional.
            fig = plt.figure(figsize=(self.height / self.dpi,
                                      self.width / self.dpi),
                             dpi=self.dpi)

            self._encode_dataframe(dataframe, fig)

            plt.tight_layout()
            fig.canvas.draw()

            spectra_string = fig.canvas.tostring_rgb()

            if self.directory is None:
                return spectra_string

            # the splash of the spectra is used as the file name
            name = Splash().splash(Spectrum(spec.spectra, SpectrumType.MS))
            plt.subplots_adjust(left=0, right=1, top=1, bottom=0)

            directory = self.directory if prefix is None else "{}/{}".format(
                self.directory, prefix)
            pathlib.Path(directory).mkdir(parents=True, exist_ok=True)
            plt.savefig("{}/{}.png".format(directory, name), dpi=self.dpi)

            if store_string:
                with open("{}/{}.txt".format(directory, name),
                          'w') as the_file:
                    the_file.write(spec.spectra)
            return None
        except ValueError:
            # best effort: a spectra which can't be encoded yields no image
            return None
        finally:
            # release the figure on every path, including errors
            if fig is not None:
                plt.close(fig=fig)
Ejemplo n.º 13
0
    def convert(self, pattern: str = "%%"):
        """
        Imports all visible samples whose name matches the given pattern.

        For every matching sample any existing ``MZMLSampleRecord`` with the
        same file name is deleted and recreated, and one
        ``MZMLMSMSSpectraRecord`` is created per row of the ``spectra``
        table. Spectra with a bin id are additionally classified through
        ``DatesetToPostgresConverter.classify``.

        All values are handed to the database driver as bound parameters
        rather than being formatted into the SQL text.

        :param pattern: SQL LIKE pattern the sample names have to match
        :return: None
        """

        # 1. query all visible sample information
        db = config.config(filename="database.ini", section="binbase")

        connection = psycopg2.connect(**db)

        cursor = connection.cursor()
        sample_count = connection.cursor()
        spectra = connection.cursor()
        bin_cursor = connection.cursor()

        sample_count.execute(
            "select count(*) from samples where sample_name like %s and visible = 'TRUE'",
            (pattern, ))
        count = sample_count.fetchone()[0]

        pbar = tqdm(total=count + 1,
                    desc="importing samples for pattern {}".format(pattern))

        cursor.execute(
            "select sample_id, sample_name from samples where sample_name like %s and visible = 'TRUE'",
            (pattern, ))

        row = cursor.fetchone()

        while row is not None:
            try:
                try:
                    record = MZMLSampleRecord.get(
                        MZMLSampleRecord.file_name == row[1])
                    record.delete_instance()
                except Exception:
                    # object doesn't exist
                    pass
                # 2. create sample object
                MZMLSampleRecord.create(file_name=row[1],
                                        instrument="gctof",
                                        name=row[1])

                record = MZMLSampleRecord.get(
                    MZMLSampleRecord.file_name == row[1])
                spectra.execute(
                    "select bin_id, spectra_id,spectra, retention_time from spectra where sample_id = %s",
                    (row[0], ))

                s = spectra.fetchone()

                while s is not None:
                    splash = Splash().splash(Spectrum(s[2], SpectrumType.MS))

                    # parse "mz:intensity" pairs to locate the base peak
                    spectrum = [
                        list(map(float, x.split(':')))
                        for x in s[2].strip().split()
                    ]

                    spectrum_max = max(spectrum, key=itemgetter(1))

                    MZMLMSMSSpectraRecord.create(
                        sample=record,
                        msms=s[2],
                        rt=s[3],
                        splash=splash,
                        level=1,
                        base_peak=spectrum_max[0],
                        base_peak_intensity=spectrum_max[1],
                        precursor=0,
                        precursor_intensity=0,
                        precursor_charge=0,
                        ion_count=len(spectrum),
                        scan_number=int(s[1]))

                    if s[0] is not None:
                        DatesetToPostgresConverter.classify(
                            "bin_id", splash, s[0])
                        # only bins with a name different from their id
                        bin_cursor.execute(
                            "select name from bin where bin_id = %s and bin_id::text != name",
                            (s[0], ))
                        result = bin_cursor.fetchone()
                        if result is not None:
                            DatesetToPostgresConverter.classify(
                                "bin", splash, result[0])

                    s = spectra.fetchone()

                row = cursor.fetchone()
            finally:
                pbar.update(1)
def peakprocess(beginline, endline="END", annotation=""):
    """Extract a peak table from the module-level ``data`` text.

    The lines after the first occurrence of ``beginline`` are taken as peak
    rows, either to the end of ``data`` (``endline == "END"``) or up to the
    first occurrence of ``endline``. The formatted ``PK$SPLASH``,
    ``PK$NUM_PEAK`` and ``PK$PEAK`` entries are appended to the module-level
    ``finallist``; when ``annotation`` is non-empty a ``PK$ANNOTATION``
    entry is appended as well.

    :param beginline: marker text that precedes the peak rows
    :param endline: marker text that ends the peak rows, or "END"
    :param annotation: text that separates the peak values from their
        annotation within a row; "" when rows carry no annotation
    :return: None
    """
    rows = data.split(beginline)[1].split("\n")[1:]
    if endline != "END":
        rows = "\n".join(rows).split(endline)[0].split("\n")[:-1]

    annotated = annotation != ""
    if annotated:
        # Keep a row only when its peak part differs from the previous
        # row's peak part.
        rows = [rows[0]] + [
            current for previous, current in zip(rows, rows[1:])
            if previous.split(annotation)[0] != current.split(annotation)[0]
        ]
        peak_texts = [row.split(annotation)[0] for row in rows]
    else:
        peak_texts = rows

    mz_values = []
    intensities = []
    for text in peak_texts:
        columns = text.strip().replace(" ", "\t").split("\t")
        mz_values.append(float(columns[0]))
        intensities.append(float(columns[1]))

    peaks = sorted(zip(mz_values, intensities))
    splash_entry = "PK$SPLASH: " + Splash().splash(
        Spectrum(peaks, SpectrumType.MS))
    count_entry = "PK$NUM_PEAK: " + str(len(peaks))

    strongest = max(intensities)
    peak_lines = [
        "  " + str(mz) + " " + str(intensity) + " " +
        str(round(intensity / strongest * 999)) + "\n"
        for mz, intensity in peaks
    ]
    peak_entry = "PK$PEAK: m/z int. rel.int.\n" + "".join(peak_lines) + "//"

    if not annotated:
        finallist.extend([splash_entry, count_entry, peak_entry])
        return None

    annotation_lines = []
    for peak, row in zip(peaks, rows):
        label = row.split("\"")[1]
        label = label.replace(";", "\\")
        label = label.replace(" ", "_")
        label = label.replace("\\_", "\\")
        annotation_lines.append("  " + str(peak[0]) + " " + label + "\n")
    annotation_entry = ("PK$ANNOTATION: m/z annotation\n" +
                        "".join(annotation_lines))[:-2]

    finallist.extend([splash_entry, annotation_entry, count_entry, peak_entry])
    return None
    return ' '.join('%d:%.6f' % (k, v) for k, v in bins.items())

def bin_spectrum_accurate(spectrum):
    """Sum peak intensities per ``get_bin(mz, 0.001)`` bin.

    :param spectrum: object whose ``spectrum`` attribute iterates over
        (mz, intensity) pairs
    :return: space-separated ``bin:intensity`` pairs, the intensity written
        with six decimals
    """
    totals = collections.defaultdict(int)

    for peak in spectrum.spectrum:
        mz, intensity = peak
        bin_key = get_bin(mz, 0.001)
        totals[bin_key] += intensity

    formatted = ['%d:%.6f' % entry for entry in totals.items()]
    return ' '.join(formatted)


if __name__ == '__main__':
    # Read "origin,spectrum" lines from stdin and print one comma-separated
    # row of derived values per non-empty spectrum; progress goes to stderr.
    splasher = SplashVersion1()

    for i, line in enumerate(sys.stdin):
        origin, spectrum_string = line.strip().split(',')

        if spectrum_string:
            spectrum = Spectrum(spectrum_string, SpectrumType.MS)

            columns = (
                origin,
                spectrum_string,
                bin_spectrum_nominal(spectrum),
                bin_spectrum_accurate(spectrum),
                splasher.encode_spectrum(spectrum),
                splasher.calculate_histogram(spectrum),
                calculate_long_histogram(spectrum),
                calculate_sum(spectrum),
                calculate_precision_sum(spectrum),
            )
            print(*columns, sep=',')

        # Report the running line index every 1000 lines.
        if i % 1000 == 0:
            print(i, file=sys.stderr)