Esempio n. 1
0
  def get_spectrum(self, location_batch):
    """Parse and pre-process the spectra stored at `location_batch`.

    Every location is handed to `self._parse_spectrum`. The precursor mass is
    computed as `precursor_mz * charge - deepnovo_config.mass_H * charge`.
    A spectrum whose precursor mass is greater than `self.MZ_MAX` is dropped
    and counted under "skipped" and "skipped_mass" in `self.spectrum_count`;
    any other spectrum is counted under "read" and passed through
    `process_spectrum`.

    Args:
      location_batch: iterable of locations accepted by
        `self._parse_spectrum`.

    Returns:
      A list with one dict per kept spectrum, in input order, with the keys
      "scan", "precursor_mass", "spectrum_holder",
      "spectrum_original_forward" and "spectrum_original_backward".
    """

    kept_spectra = []
    for spectrum_location in location_batch:

      # the parsed sequence field is not needed here
      parsed_fields = self._parse_spectrum(spectrum_location)
      (precursor_mz, charge, scan_id, _unused_sequence,
       peak_mz, peak_intensity) = parsed_fields

      precursor_mass = precursor_mz * charge - deepnovo_config.mass_H * charge

      # spectra above the mass limit are only counted, never processed
      too_heavy = precursor_mass > self.MZ_MAX
      if too_heavy:
        for counter_key in ("skipped", "skipped_mass"):
          self.spectrum_count[counter_key] += 1
        continue
      self.spectrum_count["read"] += 1

      processed = process_spectrum(peak_mz, peak_intensity, precursor_mass)
      holder, original_forward, original_backward = processed

      kept_spectra.append({
          "scan": scan_id,
          "precursor_mass": precursor_mass,
          "spectrum_holder": holder,
          "spectrum_original_forward": original_forward,
          "spectrum_original_backward": original_backward,
      })

    return kept_spectra
Esempio n. 2
0
    def _parse_spectrum(self, precursor_mz, precursor_mass, rt_mean, scan_list,
                        ms1_list, input_file_handle):
        """Build the window of neighbor spectra around the scan nearest rt_mean.

        The scan of `scan_list` whose retention time (looked up in
        `self.spectrum_rtinseconds_dict`) is closest to `rt_mean` becomes the
        window center; ties go to the earliest scan. Up to
        `self.neighbor_size // 2` scans are taken on each side of it. Each
        selected scan is read from `input_file_handle` and pre-processed with
        `process_spectrum`. Sides with too few scans are padded with all-zero
        rows so the stacked result always has `self.neighbor_size` rows (this
        requires `self.neighbor_size` to be odd, otherwise the shape assertion
        fails).

        Args:
          precursor_mz: not used by this method.
          precursor_mass: forwarded to `process_spectrum` for every scan.
          rt_mean: retention time used to choose the center scan.
          scan_list: sequence of scan ids, keys of
            `self.spectrum_rtinseconds_dict` and `self.spectrum_location_dict`.
            Must not be empty.
          ms1_list: sequence parallel to `scan_list`; the second ':'-separated
            field of each entry is parsed as the MS1 intensity.
          input_file_handle: seekable handle of the spectrum file; each
            spectrum is expected to start with the header lines BEGIN IONS,
            TITLE=, PEPMASS=, CHARGE=, SCANS=, RTINSECONDS= in that order.

        Returns:
          A tuple `(spectrum_holder, spectrum_original_forward,
          spectrum_original_backward, scan_list_middle, scan_list,
          ms1_profile)` where the first three are the vertically stacked
          arrays (zero padding included), `spectrum_holder` is divided by its
          maximum unless that maximum is zero, `scan_list_middle` lists the
          scans actually parsed, `scan_list` is the argument returned
          unchanged, and `ms1_profile` is the array of MS1 intensities of the
          window divided by their maximum, zero-padded like the spectra.

        Raises:
          AssertionError: if the largest MS1 intensity of the window is not
            positive, a header line is not the expected one, or the stacked
            spectra / MS1 profile do not have `self.neighbor_size` rows.
        """

        ### select best neighbors from the scan_list by their distance to rt_mean
        # probably move this selection to get_location(), run once rather than repeating
        neighbor_center = None
        best_distance = float('inf')
        for scan_index, scan in enumerate(scan_list):
            distance = abs(self.spectrum_rtinseconds_dict[scan] - rt_mean)
            if distance < best_distance:
                best_distance = distance
                neighbor_center = scan_index

        # number of real neighbors on each side, capped at half of the window
        neighbor_size_half = self.neighbor_size // 2
        neighbor_left_count = min(neighbor_center, neighbor_size_half)
        neighbor_right_count = min(len(scan_list) - neighbor_center - 1,
                                   neighbor_size_half)
        # number of all-zero rows needed to fill each side of the window
        pad_left = neighbor_size_half - neighbor_left_count
        pad_right = neighbor_size_half - neighbor_right_count

        ### padding zero arrays to the left if not enough neighbor spectra
        # one shared zero row is enough: np.vstack copies its inputs
        zero_row = np.zeros(shape=(1, self.MZ_SIZE), dtype=np.float32)
        spectrum_holder_list = [zero_row] * pad_left
        spectrum_original_forward_list = [zero_row] * pad_left
        spectrum_original_backward_list = [zero_row] * pad_left

        ### collect scans and normalized ms1 intensities of the window
        window = range(neighbor_center - neighbor_left_count,
                       neighbor_center + neighbor_right_count + 1)
        scan_list_middle = [scan_list[index] for index in window]
        ms1_intensity_list_middle = [
            float(ms1_list[index].split(':')[1]) for index in window
        ]
        ms1_intensity_max = max(ms1_intensity_list_middle)
        assert ms1_intensity_max > 0.0, "Error: Zero ms1_intensity_max"
        ms1_intensity_list_middle = [
            x / ms1_intensity_max for x in ms1_intensity_list_middle
        ]

        ### parse and add neighbor spectra
        # Per-spectrum normalization and weighting by the ms1 profile are
        # currently disabled; spectra are stacked as process_spectrum
        # returns them.
        header_tags = ("BEGIN IONS", "TITLE=", "PEPMASS=", "CHARGE=", "SCANS=",
                       "RTINSECONDS=")
        for scan in scan_list_middle:
            input_file_handle.seek(self.spectrum_location_dict[scan])
            # parse header lines, which must appear in this exact order
            for tag in header_tags:
                line = input_file_handle.readline()
                assert tag in line, "Error: wrong input " + tag
            # parse fragment ions
            mz_list, intensity_list = self._parse_spectrum_ion(
                input_file_handle)
            # pre-process spectrum
            (spectrum_holder, spectrum_original_forward,
             spectrum_original_backward) = process_spectrum(
                 mz_list, intensity_list, precursor_mass)
            # add spectrum to the neighbor list
            spectrum_holder_list.append(spectrum_holder)
            spectrum_original_forward_list.append(spectrum_original_forward)
            spectrum_original_backward_list.append(spectrum_original_backward)

        ### padding zero arrays to the right if not enough neighbor spectra
        spectrum_holder_list.extend([zero_row] * pad_right)
        spectrum_original_forward_list.extend([zero_row] * pad_right)
        spectrum_original_backward_list.extend([zero_row] * pad_right)

        spectrum_holder = np.vstack(spectrum_holder_list)
        spectrum_original_forward = np.vstack(spectrum_original_forward_list)
        spectrum_original_backward = np.vstack(spectrum_original_backward_list)
        assert spectrum_holder.shape == (self.neighbor_size,
                                         self.MZ_SIZE), "Error:shape"
        # spectrum-CNN normalization: by feature
        # skip the division when every entry is zero, otherwise 0/0 would
        # silently turn the whole array into NaN
        spectrum_holder_max = np.max(spectrum_holder)
        if spectrum_holder_max != 0:
            spectrum_holder /= spectrum_holder_max

        # ms1_profile, zero-padded on the same sides as the spectra
        ms1_intensity_list_middle = ([0.0] * pad_left +
                                     ms1_intensity_list_middle +
                                     [0.0] * pad_right)
        assert len(ms1_intensity_list_middle
                   ) == self.neighbor_size, "Error: ms1 profile"
        ms1_profile = np.array(ms1_intensity_list_middle)

        return spectrum_holder, spectrum_original_forward, spectrum_original_backward, scan_list_middle, scan_list, ms1_profile