def get_spectrum(self, location_batch):
    """Parse and pre-process the spectra found at the given locations.

    Every location is passed to ``self._parse_spectrum``. The precursor mass
    is computed as ``precursor_mz * charge - deepnovo_config.mass_H * charge``;
    spectra whose mass exceeds ``self.MZ_MAX`` are dropped. The counters in
    ``self.spectrum_count`` ("skipped", "skipped_mass", "read") are updated
    as a side effect.

    Args:
      location_batch: iterable of locations accepted by
        ``self._parse_spectrum``.

    Returns:
      A list with one dict per kept spectrum, holding the keys "scan",
      "precursor_mass", "spectrum_holder", "spectrum_original_forward" and
      "spectrum_original_backward" (the last three are the values returned
      by ``process_spectrum``).
    """
    # NOTE(review): the `_parse_spectrum` defined next to this method takes
    # six positional arguments and returns spectrum arrays, not the
    # (precursor_mz, charge, scan, raw_sequence, mz_list, intensity_list)
    # tuple unpacked below. This call looks stale -- confirm before relying
    # on this method.
    kept_spectra = []

    for location in location_batch:
        (precursor_mz,
         charge,
         scan,
         _raw_sequence,
         mz_list,
         intensity_list) = self._parse_spectrum(location)

        precursor_mass = precursor_mz * charge - deepnovo_config.mass_H * charge

        # Too heavy to be represented: count it as skipped and move on.
        if precursor_mass > self.MZ_MAX:
            for counter_name in ("skipped", "skipped_mass"):
                self.spectrum_count[counter_name] += 1
            continue

        self.spectrum_count["read"] += 1

        holder, forward, backward = process_spectrum(
            mz_list, intensity_list, precursor_mass)

        kept_spectra.append(dict(
            scan=scan,
            precursor_mass=precursor_mass,
            spectrum_holder=holder,
            spectrum_original_forward=forward,
            spectrum_original_backward=backward,
        ))

    return kept_spectra
def _parse_spectrum(self, precursor_mz, precursor_mass, rt_mean, scan_list,
                    ms1_list, input_file_handle):
    """Build the neighbor-spectra matrices and the MS1 profile of a feature.

    The scan whose retention time (``self.spectrum_rtinseconds_dict``) is
    closest to ``rt_mean`` becomes the center of a window of up to
    ``self.neighbor_size // 2`` scans on each side. Each scan in the window
    is read from ``input_file_handle`` and pre-processed with
    ``process_spectrum``. Missing neighbors on either side are replaced by
    all-zero rows so that the stacked result always has
    ``self.neighbor_size`` rows.

    Args:
      precursor_mz: not used by this method; kept for the caller's signature.
      precursor_mass: forwarded to ``process_spectrum`` for every scan.
      rt_mean: retention time used to choose the center scan.
      scan_list: non-empty sequence of scan ids; each must be a key of
        ``self.spectrum_rtinseconds_dict`` and ``self.spectrum_location_dict``.
      ms1_list: sequence parallel to ``scan_list``; each entry is a string
        whose second ':'-separated field is the MS1 intensity.
      input_file_handle: seekable text file positioned via
        ``self.spectrum_location_dict``; each spectrum must start with the
        header lines BEGIN IONS, TITLE=, PEPMASS=, CHARGE=, SCANS=,
        RTINSECONDS= in that order.

    Returns:
      A tuple ``(spectrum_holder, spectrum_original_forward,
      spectrum_original_backward, scan_list_middle, scan_list, ms1_profile)``:
      ``spectrum_holder`` has shape ``(self.neighbor_size, self.MZ_SIZE)``
      and is scaled by its global maximum; the forward/backward arrays are
      the stacked, unscaled ``process_spectrum`` outputs (presumably the same
      shape -- depends on ``process_spectrum``); ``scan_list_middle`` lists
      the scans actually used; ``scan_list`` is returned unchanged;
      ``ms1_profile`` is a 1-D array of length ``self.neighbor_size`` with
      the window's MS1 intensities divided by their maximum and zeros at the
      padded positions.

    Raises:
      AssertionError: if ``scan_list`` is empty, the largest MS1 intensity in
        the window is not positive, a header line is missing or out of
        order, or the stacked shapes do not match ``self.neighbor_size``.
    """
    # Fail with a clear message instead of a TypeError on None arithmetic.
    assert scan_list, "Error: empty scan_list"

    half_window = self.neighbor_size // 2

    ### select best neighbors from the scan_list by their distance to rt_mean
    # TODO(nh2tran): probably move this selection to get_location() so it
    # runs once rather than on every call.
    rt_by_scan = self.spectrum_rtinseconds_dict
    # min() returns the first index on ties, like a strict "<" scan would.
    neighbor_center = min(
        range(len(scan_list)),
        key=lambda i: abs(rt_by_scan[scan_list[i]] - rt_mean))
    neighbor_left_count = min(neighbor_center, half_window)
    neighbor_right_count = min(len(scan_list) - neighbor_center - 1,
                               half_window)
    pad_left = half_window - neighbor_left_count
    pad_right = half_window - neighbor_right_count

    def zero_rows(count):
        # One fresh all-zero spectrum row per missing neighbor.
        return [np.zeros(shape=(1, self.MZ_SIZE), dtype=np.float32)
                for _ in range(count)]

    ### collect the scans and MS1 intensities inside the window
    window = range(neighbor_center - neighbor_left_count,
                   neighbor_center + neighbor_right_count + 1)
    scan_list_middle = [scan_list[i] for i in window]
    ms1_intensity_list_middle = [
        float(ms1_list[i].split(':')[1]) for i in window]
    ms1_intensity_max = max(ms1_intensity_list_middle)
    assert ms1_intensity_max > 0.0, "Error: Zero ms1_intensity_max"
    ms1_intensity_list_middle = [
        x / ms1_intensity_max for x in ms1_intensity_list_middle]

    ### parse and add neighbor spectra, zero-padded on the left
    spectrum_holder_list = zero_rows(pad_left)
    spectrum_original_forward_list = zero_rows(pad_left)
    spectrum_original_backward_list = zero_rows(pad_left)

    header_tags = ("BEGIN IONS", "TITLE=", "PEPMASS=", "CHARGE=", "SCANS=",
                   "RTINSECONDS=")
    for scan in scan_list_middle:
        input_file_handle.seek(self.spectrum_location_dict[scan])

        # The header lines are only validated, not parsed.
        for tag in header_tags:
            line = input_file_handle.readline()
            assert tag in line, "Error: wrong input " + tag

        # parse fragment ions
        mz_list, intensity_list = self._parse_spectrum_ion(input_file_handle)

        # pre-process spectrum
        (spectrum_holder,
         spectrum_original_forward,
         spectrum_original_backward) = process_spectrum(
             mz_list, intensity_list, precursor_mass)

        # Per-spectrum normalization and weighting by the MS1 intensity are
        # intentionally not applied here; only the stacked holder is
        # normalized below.
        spectrum_holder_list.append(spectrum_holder)
        spectrum_original_forward_list.append(spectrum_original_forward)
        spectrum_original_backward_list.append(spectrum_original_backward)

    ### padding zero arrays to the right if not enough neighbor spectra
    spectrum_holder_list.extend(zero_rows(pad_right))
    spectrum_original_forward_list.extend(zero_rows(pad_right))
    spectrum_original_backward_list.extend(zero_rows(pad_right))

    spectrum_holder = np.vstack(spectrum_holder_list)
    spectrum_original_forward = np.vstack(spectrum_original_forward_list)
    spectrum_original_backward = np.vstack(spectrum_original_backward_list)
    assert spectrum_holder.shape == (self.neighbor_size,
                                     self.MZ_SIZE), "Error:shape"

    # spectrum-CNN normalization: by feature.
    # Skip the division when every entry is zero; dividing by a zero maximum
    # would fill the whole matrix with NaN.
    holder_max = np.max(spectrum_holder)
    if holder_max != 0:
        spectrum_holder /= holder_max

    # ms1_profile: zeros at the padded positions, aligned with the rows above
    ms1_intensity_list_middle = (
        [0.0] * pad_left + ms1_intensity_list_middle + [0.0] * pad_right)
    assert len(ms1_intensity_list_middle
               ) == self.neighbor_size, "Error: ms1 profile"
    ms1_profile = np.array(ms1_intensity_list_middle)

    return (spectrum_holder,
            spectrum_original_forward,
            spectrum_original_backward,
            scan_list_middle,
            scan_list,
            ms1_profile)