def test_invalid_spectra(self):
    """Spectrum construction must raise ValueError for malformed input."""
    # Checked in order: an empty string, an empty peak list, and a string
    # whose values are not written as 'mz:intensity' pairs.
    invalid_inputs = ('', [], '1 10 2 5 3 5')
    for raw_spectrum in invalid_inputs:
        with self.assertRaises(ValueError):
            Spectrum(raw_spectrum, SpectrumType.MS)
def create_splash(input_file, output_file, separator, spectrum_type, spectrum_col, origin_col):
    """Compute a SPLASH for every line of a delimited file and write it out.

    Each input line is stripped and split on ``separator``.  The spectrum
    string is read from column ``spectrum_col`` and hashed; the output line is
    the splash followed by all original columns, joined by ``separator``.
    Progress and timing information is written to stderr.

    :param input_file: path of the file to read
    :param output_file: path of the file to write, or None to write to stdout
    :param separator: column delimiter used for both input and output
    :param spectrum_type: passed through to ``Spectrum``
    :param spectrum_col: 1-based index of the column holding the spectrum
    :param origin_col: 1-based index of the origin column; it must exist on
        every line (IndexError otherwise)
    """
    start_time = time.time()
    splasher = Splash()

    # Stays 0 when the input is empty, so the summary below never touches an
    # unbound loop variable.
    processed = 0

    with open(input_file, 'r') as f:
        # The output is opened after the input so that a missing input file
        # does not truncate an existing output file.
        fout = open(output_file, 'w') if output_file is not None else sys.stdout

        try:
            for processed, line in enumerate(f, start=1):
                # Handle input
                line = line.strip().split(separator)

                # The origin value is not used in the output; the lookup is
                # kept so that a line lacking that column still fails.
                _origin = line[origin_col - 1]
                spectrum_string = line[spectrum_col - 1]

                spectrum = Spectrum(spectrum_string, spectrum_type)
                splash_code = splasher.splash(spectrum)

                # Print the spectrum id with the calculated splash id
                print(splash_code, *line, sep = separator, file = fout)

                if processed % 10000 == 0:
                    print('processed %d spectra, %.2f ms average time to splash a spectrum' %
                          (processed, 1000 * (time.time() - start_time) / processed),
                          file = sys.stderr)
        finally:
            # Only close what we opened: sys.stdout must stay usable for the
            # rest of the program.
            if output_file is not None:
                fout.close()

    print('finished processing, processing took: %.2f s' % (time.time() - start_time),
          file = sys.stderr)
    print('processed %d spectra' % processed, file = sys.stderr)

    # No average can be reported for an empty input (division by zero).
    if processed:
        print('average time including io to splash a spectra is %.2f ms' %
              (1000 * (time.time() - start_time) / processed), file = sys.stderr)
def test_spectrum_3(self):
    """The long nominal-mass spectrum below must hash to the expected SPLASH."""
    expected = 'splash10-056v-2900000000-f47edee35669c8f014c2'

    # Adjacent literals are concatenated into one space separated string.
    peaks = (
        '85:1095.0 86:185.0 87:338.0 88:20.0 89:790.0 91:7688.0 92:1208.0 93:5630.0 '
        '94:838.0 95:7354.0 96:661.0 97:1634.0 99:410.0 101:712.0 102:182.0 103:1080.0 '
        '104:304.0 105:7310.0 106:1494.0 107:4463.0 108:768.0 109:2996.0 110:466.0 '
        '111:771.0 112:27.0 113:317.0 114:66.0 115:1485.0 116:673.0 117:2789.0 '
        '118:719.0 119:4687.0 120:2294.0 121:4237.0 122:489.0 123:1162.0 124:120.0 '
        '125:335.0 126:204.0 128:1042.0 129:15126.0 130:2142.0 131:3635.0 132:966.0 '
        '133:2697.0 134:681.0 135:1701.0 136:205.0 137:360.0 139:199.0 141:544.0 '
        '142:147.0 143:2163.0 144:394.0 145:3924.0 146:869.0 147:1624.0 148:615.0 '
        '149:1321.0 150:88.0 151:373.0 152:109.0 153:268.0 154:84.0 155:770.0 '
        '156:266.0 157:859.0 158:486.0 159:2227.0 160:1288.0 161:1723.0 162:387.0 '
        '163:949.0 164:178.0 165:359.0 166:52.0 167:62.0 168:118.0 169:352.0 '
        '170:79.0 171:675.0 172:238.0 173:1051.0 174:364.0 175:465.0 176:140.0 '
        '177:434.0 178:137.0 179:218.0 180:65.0 181:195.0 182:77.0 183:215.0 '
        '184:104.0 185:341.0 186:141.0 187:327.0 188:104.0 189:376.0 190:42.0 '
        '191:261.0 193:13.0 195:79.0 196:99.0 197:70.0 198:120.0 199:378.0 '
        '200:216.0 201:267.0 202:67.0 203:596.0 204:151.0 205:153.0 206:184.0 '
        '208:85.0 209:132.0 212:62.0 213:724.0 214:270.0 215:238.0 216:119.0 '
        '217:433.0 218:81.0 219:258.0 220:22.0 221:1.0 224:42.0 225:23.0 '
        '227:142.0 228:122.0 229:159.0 231:15.0 232:25.0 233:240.0 235:101.0 '
        '236:39.0 237:47.0 239:47.0 240:64.0 241:37.0 242:31.0 243:58.0 '
        '244:16.0 245:199.0 246:47.0 247:433.0 248:60.0 249:16.0 250:34.0 '
        '251:30.0 255:636.0 256:151.0 257:20.0 258:30.0 259:138.0 260:66.0 '
        '261:79.0 264:13.0 267:85.0 268:41.0 271:32.0 273:128.0 274:117.0 '
        '275:182.0 279:40.0 280:24.0 282:74.0 284:40.0 287:36.0 288:49.0 '
        '289:79.0 291:50.0 295:55.0 296:23.0 297:67.0 298:55.0 300:30.0 '
        '301:74.0 303:67.0 304:31.0 306:10.0 311:73.0 312:61.0 313:87.0 '
        '314:32.0 317:7.0 318:33.0 326:127.0 327:192.0 328:272.0 329:1729.0 '
        '330:473.0 331:136.0 339:47.0 340:21.0 342:30.0 349:32.0 351:29.0 '
        '353:641.0 354:300.0 355:14.0 '
        '368:1096.0 369:424.0 370:76.0 382:32.0 407:19.0 415:29.0 416:8.0 '
        '429:90.0 430:28.0 440:8.0 442:10.0 443:31.0 444:63.0 447:36.0 '
        '451:31.0 454:7.0 456:40.0 458:354.0 459:298.0 460:83.0 461:57.0 '
        '468:23.0 469:29.0 472:16.0 484:19.0 486:27.0'
    )

    computed = Splash().splash(Spectrum(peaks, SpectrumType.MS))
    self.assertEqual(computed, expected)
def test_spectrum_1(self):
    """The accurate-mass spectrum below must hash to the expected SPLASH."""
    expected = 'splash10-014i-4900000000-889a38f7ace2626a0435'

    # Adjacent literals are concatenated into one space separated string.
    peaks = (
        '66.0463:2.1827 105.0698:7.9976 103.0541:4.5676 130.065:8.6025 '
        '93.0572:0.2544 79.0542:4.4657 91.0541:2.5671 131.0728:2.6844 '
        '115.0541:1.3542 65.0384:0.6554 94.0412:0.5614 116.0494:1.2008 '
        '95.049:2.1338 117.0572:100 89.0385:11.7808 77.0385:3.3802 '
        '90.0463:35.6373 132.0806:2.343 105.0446:1.771'
    )

    computed = Splash().splash(Spectrum(peaks, SpectrumType.MS))
    self.assertEqual(computed, expected)
def test_spectrum_2(self):
    """The two-decimal spectrum below must hash to the expected SPLASH."""
    expected = 'splash10-0udi-0049200000-ef488ecacceeaaadb4a2'

    # Adjacent literals are concatenated into one space separated string.
    peaks = (
        '303.07:100 662.26:1.2111 454.91:1.2023 433.25:0.8864 432.11:2.308 '
        '592.89:3.9052 259.99:0.6406 281.14:1.2549 451.34:1.1847 '
        '499.85:1.2374 482.14:2.4133 450:23.5191 483:1.0004 285.25:1.448 '
        '253.1:46.5731 254.11:3.247 259.13:6.9241 304.14:17.2795'
    )

    computed = Splash().splash(Spectrum(peaks, SpectrumType.MS))
    self.assertEqual(computed, expected)
def callback(msms: PySpectrum, file_name: str):
    """Write one ';'-separated summary line for ``msms`` to ``out``.

    The line holds, in order: ms level, the two leading values of the top
    entry returned by ``highest_peaks(1)``, the first scan time, the SPLASH
    and the converted spectrum string.  Nothing is written when ``msms`` is
    None.  ``file_name`` is not used.
    """
    if msms is None:
        return

    top_peak = msms.highest_peaks(1)[0]
    peak_string = msms.convert(msms).spectra
    splash_key = Splash().splash(Spectrum(peak_string, SpectrumType.MS))

    fields = (
        msms.ms_level,
        top_peak[0],
        top_peak[1],
        msms.scan_time[0],
        splash_key,
        peak_string,
    )
    out.write("{};{};{};{};{};{}\n".format(*fields))
def get_splash(peaks):
    """Return the SPLASH computed from ``peaks``.

    :param peaks: iterable of indexable peaks; element 0 of each peak is used
        as m/z and element 1 as intensity, any further elements are ignored
    :return: the value returned by ``Splash().splash`` for the spectrum

    Every (mz, intensity) pair and the final splash are printed to stdout.
    """
    peak_data = []

    for peak in peaks:
        row = (peak[0], peak[1])  # mz, intensity
        # Function-call form of print: valid on Python 3 and, for a single
        # argument, prints the same text on Python 2.
        print(row)
        peak_data.append(row)

    spectrum = Spectrum(peak_data, SpectrumType.MS)

    # Named splash_code rather than hash to avoid shadowing the builtin.
    splash_code = Splash().splash(spectrum)
    print(splash_code)
    return splash_code
def get_SPLASH_from_pySPLASH(ion_list: List[Tuple[float, float]]) -> str:
    """
    Generate a SPLASH key using pySPLASH installed in the local env.

    Args:
        ion_list: List of mass and intensity values.

    Returns:
        The value produced by ``Splash().splash`` for the spectrum.
    """
    return Splash().splash(Spectrum(ion_list, SpectrumType.MS))
def callback(msms: PySpectrum, file_name: str):
    """Store one spectrum record for the sample identified by ``file_name``.

    Runs inside ``db.atomic()``.  With ``msms`` set to None the call is
    forwarded to ``self.extract_record(file_name)``.  Otherwise the sample
    record is looked up and a ``MZMLMSMSSpectraRecord`` is created for the
    spectrum; an IndexError raised while gathering the values or creating the
    record is ignored and nothing is stored for this spectrum.
    """
    with db.atomic():
        if msms is None:
            self.extract_record(file_name)
            return

        # load the sample object the spectrum belongs to
        sample_record = MZMLSampleRecord.get(
            MZMLSampleRecord.file_name == file_name)

        try:
            top_peak = msms.highest_peaks(1)[0]
            raw_spectra = msms.convert(msms, mode="raw").spectra

            # first selected precursor, or an empty mapping when there is none
            if len(msms.selected_precursors) > 0:
                precursor = msms.selected_precursors[0]
            else:
                precursor = {}

            def precursor_field(key):
                # missing precursor attributes are stored as 0
                return precursor[key] if key in precursor else 0

            scan_index = msms.index
            splash_key = Splash().splash(
                Spectrum(raw_spectra, SpectrumType.MS))

            MZMLMSMSSpectraRecord.create(
                sample=sample_record,
                msms=raw_spectra,
                rt=msms.scan_time[0],
                splash=splash_key,
                scan_number=scan_index,
                level=msms.ms_level,
                base_peak=top_peak[0],
                base_peak_intensity=top_peak[1],
                precursor=precursor_field('mz'),
                precursor_intensity=precursor_field('i'),
                precursor_charge=precursor_field('charge'),
                ion_count=len(msms.peaks("centroided")))
        except IndexError:
            # not able to find highest peak
            pass
def test_spectrum_string(self):
    """A 'mz:intensity' string yields three peaks whose largest intensity is 100."""
    parsed = Spectrum("1:10 2:5 3:5", SpectrumType.MS)

    self.assertEqual(len(parsed.spectrum), 3)

    top_intensity = max(peak[1] for peak in parsed.spectrum)
    self.assertEqual(top_intensity, 100.0)
def test_spectrum_list(self):
    """A list of (mz, intensity) tuples yields three peaks whose largest intensity is 100."""
    parsed = Spectrum([(1, 10), (2, 5), (3, 5)], SpectrumType.MS)

    self.assertEqual(len(parsed.spectrum), 3)

    top_intensity = max(peak[1] for peak in parsed.spectrum)
    self.assertEqual(top_intensity, 100.0)
def _encode(self, spec: Spectra, prefix: str = None, store_string: bool = False):
    """
    encodes the given spectra as an image

    :param spec: spectra; ``spec.spectra`` is read as space separated
        ``mz:intensity`` pairs, pairs that cannot be parsed are skipped
    :param prefix: optional sub directory below ``self.directory`` in which
        the image is stored
    :param store_string: do you also want to store the spectra string next to
        the image? only used when ``self.directory`` is set
    :return: the rendered figure as a raw RGB string when ``self.directory``
        is None; None when the image was written to disk instead, or when a
        ValueError was raised while building or rendering the figure
    """
    data = []

    # convert spectra to a list of row dictionaries
    for pair in spec.spectra.split(" "):
        try:
            mass, intensity = pair.split(":")
            frac, whole = math.modf(float(mass))

            data.append({
                "intensity": float(intensity),
                "mz": float(mass),
                "nominal": int(whole),
                "frac": round(float(frac), 4)
            })
        except ValueError:
            # malformed pair: wrong number of ':' fields or non numeric value
            continue

    fig = None

    try:
        dataframe = pd.DataFrame(
            data, columns=["intensity", "mz", "nominal", "frac"])

        # group by 5 digits
        dataframe = dataframe.groupby(
            dataframe['mz'].apply(lambda x: round(x, 5))).sum()

        # drop data outside min and max
        dataframe = dataframe[(dataframe['nominal'] >= self.min_mz)
                              & (dataframe['nominal'] <= self.max_mz)]

        # min-max scale the intensities
        dataframe['intensity_min_max'] = (
            dataframe['intensity'] - dataframe['intensity'].min()
        ) / (dataframe['intensity'].max() - dataframe['intensity'].min())

        # formatting
        # NOTE(review): matplotlib's figsize is (width, height) while height
        # is passed first here - confirm this is intended.
        fig = plt.figure(figsize=(self.height / self.dpi,
                                  self.width / self.dpi),
                         dpi=self.dpi)

        self._encode_dataframe(dataframe, fig)

        plt.tight_layout()
        fig.canvas.draw()

        # NOTE(review): tostring_rgb is deprecated in recent matplotlib
        # releases; kept because callers receive its return value.
        spectra_string = fig.canvas.tostring_rgb()

        if self.directory is not None:
            name = Splash().splash(Spectrum(spec.spectra, SpectrumType.MS))
            plt.subplots_adjust(left=0, right=1, top=1, bottom=0)

            directory = self.directory if prefix is None else "{}/{}".format(
                self.directory, prefix)
            pathlib.Path(directory).mkdir(parents=True, exist_ok=True)
            plt.savefig("{}/{}.png".format(directory, name), dpi=self.dpi)

            if store_string:
                with open("{}/{}.txt".format(directory, name), 'w') as the_file:
                    the_file.write(spec.spectra)

            return None

        return spectra_string
    except ValueError:
        # best effort: a spectrum that cannot be rendered yields no encoding
        return None
    finally:
        # Always release the figure. It used to be closed only after saving
        # to disk, so figures accumulated in pyplot when the string was
        # returned or an error occurred.
        if fig is not None:
            plt.close(fig=fig)
def convert(self, pattern: str = "%%"):
    """
    converts the samples matching the given pattern and stores them at the
    defined postgres database location

    For every visible sample whose name matches ``pattern`` any existing
    sample record is removed, a new one is created and one spectrum record is
    created per row of the ``spectra`` table.  Spectra with a bin id are
    additionally classified by that id and, when the bin carries a name that
    differs from its id, by that name.

    :param pattern: SQL ``like`` pattern applied to the sample name
    :return:
    """
    # 1. query all visible sample information
    db = config.config(filename="database.ini", section="binbase")
    connection = psycopg2.connect(**db)
    pbar = None

    try:
        cursor = connection.cursor()
        sample_count = connection.cursor()
        spectra = connection.cursor()
        # named bin_cursor rather than bin to avoid shadowing the builtin
        bin_cursor = connection.cursor()

        # All values are passed as query parameters instead of being
        # formatted into the SQL text, so the pattern cannot alter the
        # statement.
        sample_count.execute(
            "select count(*) from samples where sample_name like %s and visible = 'TRUE'",
            (pattern,))
        count = sample_count.fetchone()[0]

        pbar = tqdm(total=count + 1,
                    desc="importing samples for pattern {}".format(pattern))

        cursor.execute(
            "select sample_id, sample_name from samples where sample_name like %s and visible = 'TRUE'",
            (pattern,))

        row = cursor.fetchone()
        while row is not None:
            try:
                try:
                    record = MZMLSampleRecord.get(
                        MZMLSampleRecord.file_name == row[1])
                    record.delete_instance()
                except Exception:
                    # object doesn't exist
                    pass

                # 2. create sample object
                MZMLSampleRecord.create(file_name=row[1],
                                        instrument="gctof",
                                        name=row[1])
                record = MZMLSampleRecord.get(
                    MZMLSampleRecord.file_name == row[1])

                spectra.execute(
                    "select bin_id, spectra_id,spectra, retention_time from spectra where sample_id = %s",
                    (row[0],))

                s = spectra.fetchone()
                while s is not None:
                    splash = Splash().splash(Spectrum(s[2], SpectrumType.MS))

                    # parse 'mz:intensity' pairs to find the base peak
                    spectrum = [
                        list(map(float, x.split(':')))
                        for x in s[2].strip().split()
                    ]
                    spectrum_max = max(spectrum, key=itemgetter(1))

                    MZMLMSMSSpectraRecord.create(
                        sample=record,
                        msms=s[2],
                        rt=s[3],
                        splash=splash,
                        level=1,
                        base_peak=spectrum_max[0],
                        base_peak_intensity=spectrum_max[1],
                        precursor=0,
                        precursor_intensity=0,
                        precursor_charge=0,
                        ion_count=len(spectrum),
                        scan_number=int(s[1]))

                    if s[0] is not None:
                        DatesetToPostgresConverter.classify(
                            "bin_id", splash, s[0])

                        bin_cursor.execute(
                            "select name from bin where bin_id = %s and bin_id::text != name",
                            (s[0],))
                        result = bin_cursor.fetchone()

                        if result is not None:
                            DatesetToPostgresConverter.classify(
                                "bin", splash, result[0])

                    s = spectra.fetchone()

                row = cursor.fetchone()
            finally:
                pbar.update(1)
    finally:
        # release the progress bar and the database connection, which were
        # previously left open
        if pbar is not None:
            pbar.close()
        connection.close()
def peakprocess(beginline, endline="END", annotation=""):
    """Extract a peak table from ``data`` and append record lines to ``finallist``.

    ``data`` and ``finallist`` are not parameters; they are read from the
    enclosing scope.

    :param beginline: marker text; the peak lines are those following the line
        that contains it
    :param endline: marker text that ends the peak section, or "END" to take
        everything after ``beginline``
    :param annotation: separator between the numeric part of a peak line and
        its annotation; an empty string means the lines carry no annotation
    :return: None; the result is the SPLASH, peak count and peak list (plus
        the annotation list when ``annotation`` is given) appended to
        ``finallist``
    """
    peakX = []
    peakY = []
    anno = []
    # Select the raw peak lines: everything after the begin marker line,
    # optionally cut off at the end marker.
    if endline == "END":
        peakinfo = data.split(beginline)[1].split("\n")[1:]
    else:
        peakinfo1 = data.split(beginline)[1].split("\n")[1:]
        peakinfo2 = "\n".join(peakinfo1).split(endline)[0]
        peakinfo = peakinfo2.split("\n")[:-1]
    if annotation != "":
        # Keep only the first of consecutive lines whose numeric part (the
        # text before the annotation separator) is identical.
        tempinfo = [peakinfo[0]]
        for i in range(1, len(peakinfo)):
            if (peakinfo[i - 1].split(annotation)[0] !=
                    peakinfo[i].split(annotation)[0]):
                tempinfo.append(peakinfo[i])
        peakinfo = tempinfo
        # Parse the two leading numeric columns of every remaining line.
        for i in range(len(peakinfo)):
            calpeakinfo = peakinfo[i].split(annotation)[0]
            calpeakinfo = calpeakinfo.strip()
            calpeakinfo = calpeakinfo.replace(" ", "\t")
            peakX.append(float(calpeakinfo.split("\t")[0]))
            peakY.append(float(calpeakinfo.split("\t")[1]))
        pairpeak = list(sorted(zip(peakX, peakY)))
        spectrum = Spectrum(pairpeak, SpectrumType.MS)
        SPNO = "PK$SPLASH: " + Splash().splash(spectrum)
        pknum = "PK$NUM_PEAK: " + str(len(pairpeak))
        maxY = max(peakY)
        pklist = "PK$PEAK: m/z int. rel.int.\n"
        # The third column is the intensity scaled so the largest peak is 999.
        for i in range(len(pairpeak)):
            pklist += (" " + str(pairpeak[i][0]) + " " + str(pairpeak[i][1]) +
                       " " + str(round(pairpeak[i][1] / maxY * 999)) + "\n")
        pklist += "//"
        pkanno = "PK$ANNOTATION: m/z annotation\n"
        # NOTE(review): pairpeak is sorted while peakinfo keeps file order, so
        # index i only refers to the same peak in both when the input is
        # already sorted - confirm against the input format.
        for i in range(len(peakinfo)):
            # the annotation is the text between the first pair of double quotes
            anno = peakinfo[i].split("\"")[1]
            anno = anno.replace(";", "\\")
            anno = anno.replace(" ", "_")
            anno = anno.replace("\\_", "\\")
            pkanno += (" " + str(pairpeak[i][0]) + " " + anno + "\n")
        # drops the last two characters of the accumulated annotation text
        pkanno = pkanno[:-2]
        finallist.extend([SPNO, pkanno, pknum, pklist])
        return None
    elif annotation == "":
        # Same parsing as above, without de-duplication or annotation output.
        for i in range(len(peakinfo)):
            calpeakinfo = peakinfo[i].strip()
            calpeakinfo = calpeakinfo.replace(" ", "\t")
            peakX.append(float(calpeakinfo.split("\t")[0]))
            peakY.append(float(calpeakinfo.split("\t")[1]))
        pairpeak = list(sorted(zip(peakX, peakY)))
        spectrum = Spectrum(pairpeak, SpectrumType.MS)
        SPNO = "PK$SPLASH: " + Splash().splash(spectrum)
        pknum = "PK$NUM_PEAK: " + str(len(pairpeak))
        maxY = max(peakY)
        pklist = "PK$PEAK: m/z int. rel.int.\n"
        # The third column is the intensity scaled so the largest peak is 999.
        for i in range(len(pairpeak)):
            pklist += (" " + str(pairpeak[i][0]) + " " + str(pairpeak[i][1]) +
                       " " + str(round(pairpeak[i][1] / maxY * 999)) + "\n")
        pklist += "//"
        finallist.extend([SPNO, pknum, pklist])
        return None
return ' '.join('%d:%.6f' % (k, v) for k, v in bins.items())


def bin_spectrum_accurate(spectrum):
    """Sum the intensities of ``spectrum.spectrum`` per bin and serialise them.

    The bin of each peak is ``get_bin(mz, 0.001)``.  The result is a space
    separated string of entries formatted with '%d:%.6f' (bin, summed
    intensity).
    """
    bins = collections.defaultdict(int)

    for mz, intensity in spectrum.spectrum:
        bins[get_bin(mz, 0.001)] += intensity

    return ' '.join('%d:%.6f' % (k, v) for k, v in bins.items())


if __name__ == '__main__':
    # Reads 'origin,spectrum' lines from stdin and prints one comma separated
    # line of derived values per non-empty spectrum.
    splasher = SplashVersion1()

    for i, line in enumerate(sys.stdin):
        line = line.strip()
        # each line must contain exactly one comma
        origin, spectrum_string = line.split(',')

        if spectrum_string:
            spectrum = Spectrum(spectrum_string, SpectrumType.MS)

            print(origin, spectrum_string, bin_spectrum_nominal(spectrum), bin_spectrum_accurate(spectrum),
                splasher.encode_spectrum(spectrum), splasher.calculate_histogram(spectrum), \
                calculate_long_histogram(spectrum), calculate_sum(spectrum), \
                calculate_precision_sum(spectrum), sep = ',')

        # progress indicator on stderr
        if i % 1000 == 0:
            print(i, file = sys.stderr)