Ejemplo n.º 1
0
def path_or_mzml(mzml):
    """
    Return an ``MZMLFile`` for ``mzml``.

    :param mzml: anything ``MZMLFile`` can be constructed from, or an
        object that already is an ``MZMLFile`` (or a subclass of it)
    :return: the newly constructed ``MZMLFile``, or ``mzml`` itself when
        construction fails and it already is an ``MZMLFile``
    :raises NotImplementedError: if ``mzml`` can neither be loaded nor is
        an ``MZMLFile`` instance
    """
    try:
        return MZMLFile(mzml)
    except Exception as err:
        # isinstance (rather than an exact type comparison) also accepts
        # subclasses of MZMLFile
        if isinstance(mzml, MZMLFile):
            return mzml
        raise NotImplementedError(
            "Didn't recognise the MZMLFile!") from err
Ejemplo n.º 2
0
def get_summary(mzml_file_path):
    """
    Build a dictionary of summary statistics for one mzML file.

    :param mzml_file_path: path to the mzML file, as a string
    :return: dict holding the index of the first non-MS1 scan, the length
        of the run of MS2 scans starting there, the first and last
        retention times (seconds), scan counts and the overall scan rate
    """
    scans = MZMLFile(mzml_file_path).scans
    n_scans = len(scans)

    # index of the first scan that is not MS1
    first_ms2 = 0
    while first_ms2 < n_scans and scans[first_ms2].ms_level == 1:
        first_ms2 += 1

    # one past the end of the consecutive MS2 run that starts there
    block_end = first_ms2
    while block_end < n_scans and scans[block_end].ms_level == 2:
        block_end += 1

    start_rt = scans[0].rt_in_seconds
    end_rt = scans[-1].rt_in_seconds

    summary = {
        'First MS2': first_ms2,
        'First MS2 block': block_end - first_ms2,
        'StartRT': start_rt,
        'EndRT': end_rt,
        'Nscans': n_scans,
        'Scans per sec': n_scans / (end_rt - start_rt),
        'FileName': mzml_file_path.split(os.sep)[-1],
    }
    summary['Nscans_MS1'] = sum(1 for s in scans if s.ms_level == 1)
    summary['Nscans_MS2'] = sum(1 for s in scans if s.ms_level == 2)
    return summary
Ejemplo n.º 3
0
def evaluate_mzml(mzml_file, picked_peaks_file, half_isolation_window):
    """
    Count the picked boxes that are matched to scans in an mzML file.

    :param mzml_file: value passed to ``MZMLFile`` to load the scans
    :param picked_peaks_file: value passed to ``load_picked_boxes``
    :param half_isolation_window: forwarded to ``map_boxes_to_scans``
    :return: the number of entries in the boxes-to-scans mapping
    """
    picked_boxes = load_picked_boxes(picked_peaks_file)
    _, boxes2scans = map_boxes_to_scans(
        MZMLFile(mzml_file),
        picked_boxes,
        half_isolation_window=half_isolation_window)
    return len(boxes2scans)
Ejemplo n.º 4
0
def run_coverage_evaluation(box_file, mzml_file, half_isolation_window):
    """
    Compute the fraction of picked boxes that are matched to scans.

    :param box_file: value passed to ``load_picked_boxes``
    :param mzml_file: value passed to ``MZMLFile`` to load the scans
    :param half_isolation_window: forwarded to ``map_boxes_to_scans``
    :return: number of entries in the boxes-to-scans mapping divided by
        the number of loaded boxes
    """
    picked_boxes = load_picked_boxes(box_file)
    _, boxes2scans = map_boxes_to_scans(
        MZMLFile(mzml_file),
        picked_boxes,
        half_isolation_window=half_isolation_window)
    n_matched = len(boxes2scans)
    return n_matched / len(picked_boxes)
Ejemplo n.º 5
0
def get_box_intensity(mzml_file, boxes):
    """
    Find, for every box, the most intense peak that falls inside it.

    Scans with ``ms_level == 2`` are skipped. A peak belongs to a box
    when the scan's retention time lies within the box's
    ``rt_range_in_seconds`` and the peak's m/z lies within its
    ``mz_range`` (both bounds inclusive).

    :param mzml_file: value passed to ``MZMLFile`` to load the scans
    :param boxes: sequence of box objects
    :return: ``(intensities, mzs)``, two lists parallel to ``boxes``;
        a box with no matching peak keeps intensity 0 and m/z ``None``
    """
    n_boxes = len(boxes)
    intensities = [0] * n_boxes
    mzs = [None] * n_boxes

    for scan in MZMLFile(mzml_file).scans:
        if scan.ms_level == 2:
            continue
        rt = scan.rt_in_seconds

        # boxes whose RT range contains this scan, with their positions
        candidates = []
        for idx, box in enumerate(boxes):
            rt_range = box.rt_range_in_seconds
            if rt_range[0] <= rt <= rt_range[1]:
                candidates.append((idx, box))
        if not candidates:
            continue

        for peak in scan.peaks:
            mz = peak[0]
            for idx, box in candidates:
                if not box.mz_range[0] <= mz <= box.mz_range[1]:
                    continue
                # keep the strongest peak seen so far for this box
                if peak[1] > intensities[idx]:
                    intensities[idx] = peak[1]
                    mzs[idx] = mz
    return intensities, mzs
Ejemplo n.º 6
0
    def add_picked_peaks(self,
                         mzml_file,
                         peak_file,
                         sample_name,
                         picking_method='mzmine',
                         sample_type=None,
                         half_isolation_window=0,
                         allow_last_overlap=False,
                         rt_shifts=None,
                         mz_shifts=None):
        """
        Load the picked peaks of one sample and align them into the aligner.

        :param mzml_file: value passed to ``MZMLFile`` to load the scans
        :param peak_file: peak-picking output, passed to the box loader
        :param sample_name: recorded in ``self.sample_names`` and used as
            the prefix of each peak's source id (``<sample_name>_<index>``)
        :param picking_method: selects the loader: 'mzmine', 'peakonly'
            or 'xcms'; any other value terminates via ``sys.exit``
        :param sample_type: recorded in ``self.sample_types``
        :param half_isolation_window: forwarded to ``map_boxes_to_scans``
        :param allow_last_overlap: forwarded to ``map_boxes_to_scans``
        :param rt_shifts: forwarded to ``update_picked_boxes``
        :param mz_shifts: forwarded to ``update_picked_boxes``
        """
        self.sample_names.append(sample_name)
        self.sample_types.append(sample_type)

        # choose the loader that matches the peak picking output
        if picking_method == 'mzmine':
            picked = load_picked_boxes(peak_file)
        elif picking_method == 'peakonly':
            picked = load_peakonly_boxes(peak_file)  # not tested
        elif picking_method == 'xcms':
            picked = load_xcms_boxes(peak_file)  # not tested
        else:
            sys.exit('Method not supported')
        picked = update_picked_boxes(picked, rt_shifts, mz_shifts)
        self.list_of_boxes.append(picked)

        # match the boxes against the scans of this sample
        _, boxes2scans = map_boxes_to_scans(
            MZMLFile(mzml_file),
            picked,
            half_isolation_window=half_isolation_window,
            allow_last_overlap=allow_last_overlap)
        precursor_intensities, _ = get_precursor_intensities(
            boxes2scans, picked, 'max')

        # one Peak per box, plus the precursor intensity at the same index
        sample_peaks = [
            Peak(box.mz, box.rt_in_seconds, box.height, sample_name,
                 sample_name + '_' + str(idx))
            for idx, box in enumerate(picked)
        ]
        frag_intensities = [
            precursor_intensities[idx] for idx, _ in enumerate(picked)
        ]

        # do alignment, adding the peaks and boxes, and recalculating
        # max frag intensity
        self._align(sample_peaks, picked, frag_intensities, sample_name)
    def test_get_max_mass(self):
        """Check ``get_max_mass`` on the first scan of a known test file."""
        os.chdir(self.test_files_folder)
        first_scan = MZMLFile("0_pp_d20_pos_1.mzML").scans[0]
        intensity, mass = self.kinetics_obj.get_max_mass(first_scan, 0, 1000)

        # Expected values were read manually for this scan in TOPPView
        self.assertEqual(intensity, 28188312.0)
        self.assertAlmostEqual(mass, 703.57514030)
Ejemplo n.º 8
0
def load_scans_from_mzml(mzml_file_name):
    """
    Load the MS2 scans of an mzML file as ``Spectrum`` objects.

    :param mzml_file_name: value passed to ``MZMLFile``
    :return: dict mapping each MS2 scan's ``scan_no`` to a ``Spectrum``
        built from that scan's precursor m/z and peaks
    """
    logger.debug("Loading scans from {}".format(mzml_file_name))
    return {
        scan.scan_no: Spectrum(scan.precursor_mz, scan.peaks)
        for scan in MZMLFile(mzml_file_name).scans
        if scan.ms_level == 2
    }
Ejemplo n.º 9
0
def extract_timing(seed_file):
    """
    Extract average scan timings from a seed mzML file.

    Gaps between consecutive scans are grouped by the pair
    (MS level of the scan, MS level of the next scan). If the file has
    both MS1->MS2 and MS2->MS2 gaps it is treated as a fragmentation
    (DDA) file: key 1 is the mean MS1->MS2 gap and key 2 the mean
    MS2->MS2 gap. Otherwise only key 1 is returned, computed as the
    mean MS1->MS1 gap.

    :param seed_file: the seed file in mzML format
    :return: a dictionary mapping ms-level (1 or 2) to the mean gap,
        in seconds
    """
    logger.debug('Extracting timing dictionary from seed file')
    scans = MZMLFile(seed_file).scans

    # gaps in seconds, grouped by (current level, next level)
    time_dict = {(1, 1): [], (1, 2): [], (2, 1): [], (2, 2): []}
    for current, following in zip(scans, scans[1:]):
        gap = 60 * following.rt_in_minutes - 60 * current.rt_in_minutes
        time_dict[(current.ms_level, following.ms_level)].append(gap)

    ms1_to_ms2 = time_dict[(1, 2)]
    ms2_to_ms2 = time_dict[(2, 2)]

    # construct timing dict in the right format for later use
    new_time_dict = {}
    if ms1_to_ms2 and ms2_to_ms2:
        # DDA file: both kinds of transition are present
        for level, gaps in ((1, ms1_to_ms2), (2, ms2_to_ms2)):
            mean = sum(gaps) / len(gaps)
            new_time_dict[level] = mean
            logger.debug('%d: %f' % (level, mean))
        assert 1 in new_time_dict and 2 in new_time_dict
    else:
        # fullscan file: only MS1 timing is available
        gaps = time_dict[(1, 1)]
        mean = sum(gaps) / len(gaps)
        new_time_dict[1] = mean
        logger.debug('%d: %f' % (1, mean))

    return new_time_dict
Ejemplo n.º 10
0
def main(mzml_file_name, msp_file_name, precursor_tolerance, hit_threshold):
    """
    Count how many library records are hit by queries from mzML data.

    :param mzml_file_name: a single mzML file, or a directory whose
        ``*.mzML`` files are all loaded; if it is neither, the process
        exits with status 0
    :param msp_file_name: value passed to ``library_from_msp``
    :param precursor_tolerance: passed to ``spectral_match`` as ``ms1_tol``
    :param hit_threshold: passed to ``spectral_match`` as ``score_thresh``
    :return: ``(n_hits, n_library_ids)``: the number of distinct hit ids
        and the number of records in the library
    """
    if os.path.isfile(mzml_file_name):
        mzml_paths = [mzml_file_name]
    elif os.path.isdir(mzml_file_name):
        mzml_paths = glob.glob(os.path.join(mzml_file_name, '*.mzML'))
    else:
        logger.debug("No mzML files found")
        sys.exit(0)

    mzml_file_objects = {path: MZMLFile(path) for path in mzml_paths}
    for path, loaded in mzml_file_objects.items():
        logger.debug("Loaded {} scans from {}".format(
            len(loaded.scans), path))

    sl = library_from_msp(msp_file_name)
    logger.debug("Created library from {}".format(msp_file_name))

    # collect the id (first element) of every hit, across all files
    hit_ids = set()
    for loaded in mzml_file_objects.values():
        for query in make_queries_from_mzml(loaded):
            hits = sl.spectral_match(query,
                                     ms1_tol=precursor_tolerance,
                                     score_thresh=hit_threshold)
            hit_ids.update(hit[0] for hit in hits)

    n_library_ids = len(set(sl.records.keys()))
    n_hits = len(hit_ids)
    logger.debug("Out of {} IDs, {} got hits".format(n_library_ids, n_hits))
    return n_hits, n_library_ids
Ejemplo n.º 11
0
def evaluate_boxes_as_dict(boxes, out_dir):
    """
    Count matched boxes for every mzML file in a directory.

    :param boxes: boxes passed to ``map_boxes_to_scans``
    :param out_dir: directory searched for ``*.mzML`` files
    :return: dict mapping each file's base name to the number of entries
        in its boxes-to-scans mapping
    """
    counts = {}
    for path in glob.glob(os.path.join(out_dir, '*.mzML')):
        name = os.path.basename(path)
        _, boxes2scans = map_boxes_to_scans(MZMLFile(path),
                                            boxes,
                                            half_isolation_window=0)
        n_found = len(boxes2scans)
        logger.info('- %s: found %d boxes with scans' % (name, n_found))
        counts[name] = n_found
    logger.debug(counts)
    return counts
Ejemplo n.º 12
0
    def __init__(self, mzml_file, min_n_peaks=1, min_total_intensity=1e3,
                 min_proportion=0.1, max_proportion=0.8,
                 with_replacement=False):
        """
        Store the settings, load the mzML file and filter its scans.

        :param mzml_file: mzML file; kept as given in ``mzml_file_name``
            and converted with ``str`` before being loaded
        :param min_n_peaks: stored; per the original comment, the minimum
            number of peaks an MS2 scan needs to be kept
        :param min_total_intensity: stored; per the original comment, the
            minimum total intensity an MS2 scan needs to be kept
        :param min_proportion: stored as ``self.min_proportion``
        :param max_proportion: stored as ``self.max_proportion``
        :param with_replacement: stored as ``self.with_replacement``
        """
        # filtering thresholds and sampling settings
        self.min_n_peaks = min_n_peaks
        self.min_total_intensity = min_total_intensity
        self.min_proportion = min_proportion
        self.max_proportion = max_proportion
        self.with_replacement = with_replacement

        # source data
        self.mzml_file_name = mzml_file
        self.mzml_object = MZMLFile(str(mzml_file))

        # only keep MS2 scans that have at least min_n_peaks and
        # a total intensity of at least min_total_intensity
        self._filter_scans()
Ejemplo n.º 13
0
def evaluate_boxes_as_array(boxes, out_dir, yticks, xticks, pattern, params):
    """
    Count matched boxes over a grid of mzML files.

    The file for grid cell (y, x) is named
    ``pattern.format(params['sample_name'], y, x)`` inside ``out_dir``.

    :param boxes: boxes passed to ``map_boxes_to_scans``
    :param out_dir: directory containing the mzML files
    :param yticks: values indexing the rows of the result
    :param xticks: values indexing the columns of the result
    :param pattern: format string with three positional fields
    :param params: dict providing ``'sample_name'``
    :return: array of shape ``(len(yticks), len(xticks))`` holding the
        size of each boxes-to-scans mapping, or NaN where a
        ``FileNotFoundError`` was raised
    """
    sample_name = params['sample_name']
    counts = np.zeros((len(yticks), len(xticks)))
    for row, y_val in enumerate(yticks):
        for col, x_val in enumerate(xticks):
            try:
                path = os.path.join(
                    out_dir, pattern.format(sample_name, y_val, x_val))
                _, boxes2scans = map_boxes_to_scans(
                    MZMLFile(path), boxes, half_isolation_window=0)
                n_found = len(boxes2scans)
            except FileNotFoundError:
                n_found = np.nan
            counts[row, col] = n_found
            logger.debug(counts)
    return counts
Ejemplo n.º 14
0
def main():
    """
    Create scan-time plots for one mzML file or a folder of mzML files.

    Command line arguments:
      * ``file_or_folder``: an mzML file, or a directory whose ``*.mzML``
        files are all processed
      * ``--save_plots``: save each figure as ``<mzml path>.png`` instead
        of showing it

    For every file one figure with two rows is produced from the result
    of ``get_times``: the top row holds a histogram per key, the bottom
    row a scatter plot per key.
    """
    # NOTE(review): kept from the original so that ``rt`` is still
    # published as a module-level name; confirm whether anything reads it.
    global rt
    parser = argparse.ArgumentParser(description='Create scan time plots')
    parser.add_argument('file_or_folder', type=str)
    parser.add_argument('--save_plots', dest='save_plots', action='store_true')
    args = parser.parse_args()
    if os.path.isdir(args.file_or_folder):
        print("Extracting mzml from folder")
        file_list = glob.glob(os.path.join(args.file_or_folder, '*.mzML'))
    else:
        print("Individual file")
        file_list = [args.file_or_folder]
    mzml_file_objects = {}
    timings = {}
    for mzml_file in file_list:
        mzml_file_objects[mzml_file] = MZMLFile(mzml_file)
        timings[mzml_file] = get_times(mzml_file_objects[mzml_file])
    # plot
    for mo, t in timings.items():
        nsp = len(t)  # number of subplots
        plt.figure(figsize=(20, 8))
        base_title = mo.split(os.sep)[-1]
        pos = 1
        # top row: histogram of the second element of each pair
        # (assumes each value of ``t`` is a list of 2-tuples — TODO confirm)
        for k, v in t.items():
            plt.subplot(2, nsp, pos)
            plt.title(base_title + str(k))
            try:
                rt, de = zip(*v)
            except Exception:
                print("No data for " + str(k))
            else:
                # only draw when unpacking worked; otherwise ``de`` would
                # be undefined or left over from the previous key
                plt.hist(de)
            pos += 1
        # bottom row: second element plotted against the first
        for k, v in t.items():
            plt.subplot(2, nsp, pos)
            plt.title(base_title + str(k))
            try:
                rt, de = zip(*v)
                plt.plot(rt, de, 'ro')
            except Exception:
                print("No data for " + str(k))
            pos += 1

        if args.save_plots:
            plot_filename = mo + '.png'
            plt.savefig(plot_filename)
            # release the figure so they do not pile up across many files
            plt.close()
        else:
            plt.show()
0
 def update_scores(self, aligner, experiment_ids, experiment_scores,
                   ms2_mzml):
     """
     Score the aligner's current boxes against one mzML file and refresh
     the cumulative fragmentation statistics.

     ``experiment_scores`` and ``experiment_ids`` are modified in place:
     the new experiment is appended and every earlier experiment is
     re-indexed to the current list of box ids.

     :param aligner: object providing ``peaksets`` and ``peaksets2boxes``
     :param experiment_ids: list holding, per experiment, a list of box ids
     :param experiment_scores: list holding, per experiment, a list of 0/1
         scores parallel to the ids
     :param ms2_mzml: value passed to ``MZMLFile``
     :return: ``(experiment_scores, experiment_ids, boxes)``
     """
     # get boxes
     boxes = [
         aligner.peaksets2boxes[peakset] for peakset in aligner.peaksets
     ]
     # get box ids
     box_ids = [box.peak_id for box in boxes]
     # get box scores
     mz_file = MZMLFile(
         ms2_mzml)  # TODO: change this to take last of list of mzmls
     # NOTE(review): 0.75 is passed positionally; presumably the half
     # isolation window — confirm against map_boxes_to_scans
     scans2boxes, boxes2scans = map_boxes_to_scans(
         mz_file, boxes, 0.75)  # TODO: add default parameter here
     # 1 when the box has an entry in boxes2scans, 0 otherwise
     box_scores = [(box in boxes2scans) * 1 for box in boxes]
     # store results for experiment
     experiment_scores.append(box_scores)
     experiment_ids.append(box_ids)
     if len(experiment_scores) > 1:
         # update old experiment results
         # Each earlier experiment is rewritten to follow the order of the
         # current box ids; ids it did not contain keep a score of 0.
         for j in range(len(experiment_scores) - 1):
             updated_experiment_ids = box_ids
             updated_experiment_scores = np.array([0 for id in box_ids])
             for id in updated_experiment_ids:
                 if id in experiment_ids[j]:
                     # position of the id in the new and in the old ordering
                     where1 = np.where(
                         np.array(updated_experiment_ids) == id)[0][0]
                     where2 = np.where(
                         np.array(experiment_ids[j]) == id)[0][0]
                     updated_experiment_scores[where1] = np.array(
                         experiment_scores[j])[where2]
             updated_experiment_scores = list(updated_experiment_scores)
             experiment_scores[j] = updated_experiment_scores
             experiment_ids[j] = updated_experiment_ids
     # get cumulative fragmentation and score
     # NOTE(review): this reads self.experiment_scores rather than the
     # experiment_scores argument — presumably callers pass the same list;
     # verify against the call sites.
     cumulative_score = []
     for i in range(len(experiment_scores)):
         # element-wise maximum over the first i + 1 experiments
         updated_fragmentation = [
             max(x) for x in zip(*self.experiment_scores[:i + 1])
         ]
         cumulative_score.append(sum(updated_fragmentation))
     # value left over from the last loop iteration (all experiments)
     self.cumulative_fragmentation = updated_fragmentation
     self.cumulative_score = cumulative_score
     # calculate updated intensities for each box
     self.box_intensities = self.calculate_box_intensities(
         self.completed_mzmls, boxes)
     return experiment_scores, experiment_ids, boxes
Ejemplo n.º 16
0
    def create_scan_intensities(mzml_path, num_injection, schedule, mz_window):
        """
        Build one ``MatchingScan`` per scheduled scan from a real mzML file.

        For every scheduled MS1 scan the intensities come from the pair of
        neighbouring original MS1 scans returned by
        ``MatchingScan.interpolate_scan``, weighted by where the scheduled
        time lies between them. Scheduled scans with an MS level above 1,
        and all scans once fewer than two original MS1 scans remain, get
        empty m/z and intensity lists.

        :param mzml_path: value passed to ``MZMLFile``
        :param num_injection: passed through to every ``MatchingScan``
        :param schedule: iterable whose items either expose ``ms_level``
            and ``rt`` attributes or unpack as ``(ms_level, rt)``
        :param mz_window: forwarded to ``MatchingScan.interpolate_scan``
        :return: list of ``MatchingScan`` objects, one per schedule item
        """
        new_scans = []
        # Sorted by descending RT so that the earliest remaining scan sits
        # at the end of the list and can be discarded with a cheap pop().
        original_scans = sorted(
            (s for s in MZMLFile(mzml_path).scans if s.ms_level == 1),
            key=lambda s: s.rt_in_seconds,
            reverse=True)

        # Interpolation needs two original MS1 scans. With fewer than two
        # the values stay unset and are never read, because the loop below
        # then only emits empty scans.
        left_rt = right_rt = mzs = intensities = owner = in_window = None
        if len(original_scans) > 1:
            left_rt, right_rt, mzs, intensities, owner, in_window = \
                MatchingScan.interpolate_scan(original_scans[-1],
                                              original_scans[-2], mz_window)

        for s in schedule:
            try:
                ms_level, rt = s.ms_level, s.rt
            except AttributeError:
                ms_level, rt = s

            # drop original scans until ``rt`` is no longer past the
            # second-earliest one, then refresh the interpolation data
            advanced = False
            while (len(original_scans) > 1
                   and original_scans[-2].rt_in_seconds < rt):
                original_scans.pop()
                advanced = True
            if advanced and len(original_scans) > 1:
                left_rt, right_rt, mzs, intensities, owner, in_window = \
                    MatchingScan.interpolate_scan(original_scans[-1],
                                                  original_scans[-2],
                                                  mz_window)

            if ms_level > 1 or len(original_scans) < 2:
                new_scans.append(
                    MatchingScan(num_injection, ms_level, rt, [], []))
            else:
                # relative position of ``rt`` between the two scans
                w = (rt - left_rt) / (right_rt - left_rt)
                # NOTE(review): ``owner`` presumably flags which of the two
                # scans each peak came from — confirm in interpolate_scan
                weighted_intensities = (owner * (1 - w) * intensities +
                                        (1 - owner) * w * intensities)

                new_intensities = [
                    np.sum(weighted_intensities[left_bound:right_bound])
                    for (left_bound, right_bound) in in_window
                ]
                new_scans.append(
                    MatchingScan(num_injection, ms_level, rt, mzs,
                                 new_intensities))

        return new_scans
Ejemplo n.º 17
0
    def _extract_timing(self, seed_file):
        """
        Collect the time gaps between consecutive scans of a seed file.

        :param seed_file: the seed file in mzML format
        :return: a dictionary keyed by (MS level of a scan, MS level of
            the next scan) for levels 1 and 2; each value is the list of
            gaps, in seconds, between such consecutive scans
        """
        logger.debug('Extracting timing dictionary from seed file')
        scans = MZMLFile(seed_file).scans

        time_dict = {(1, 1): [], (1, 2): [], (2, 1): [], (2, 2): []}
        for current, following in zip(scans, scans[1:]):
            transition = (current.ms_level, following.ms_level)
            gap = 60 * following.rt_in_minutes - 60 * current.rt_in_minutes
            time_dict[transition].append(gap)
        return time_dict
Ejemplo n.º 18
0
    def _get_distributions(self):
        """
        Estimate retention time and intensity distributions from the file.

        Sets ``rt_bins`` / ``rt_probs`` from the summed MS1 intensity in
        each one-second retention time bin within
        ``[min_rt, max_rt]``, and ``intensity_bins`` /
        ``intensity_probs`` from a histogram of the log of each ROI's
        maximum intensity, restricted to
        ``[min_log_intensity, max_log_intensity]``.
        """
        source_path = str(self.mzml_file_name)

        # total MS1 intensity per one-second retention time bin
        intensity_per_second = {}
        for scan in MZMLFile(source_path).scans:
            if scan.ms_level != 1:
                continue
            _, peak_intensities = zip(*scan.peaks)
            scan_total = sum(peak_intensities)
            rt = scan.rt_in_seconds
            if rt < self.min_rt or rt > self.max_rt:
                continue
            second = int(rt)
            intensity_per_second[second] = (
                intensity_per_second.get(second, 0) + scan_total)
        grand_total = sum(intensity_per_second.values())
        self.rt_bins = [(sec, sec + 1) for sec in intensity_per_second]
        self.rt_probs = [
            total / grand_total for total in intensity_per_second.values()
        ]

        # histogram of the log of each ROI's maximum intensity
        rois = make_roi(source_path, self.roi_params)
        all_logs = [np.log(max(roi.intensity_list)) for roi in rois]
        log_roi_intensities = [
            value for value in all_logs
            if self.min_log_intensity <= value <= self.max_log_intensity
        ]
        counts, bin_edges = np.histogram(log_roi_intensities,
                                         bins=self.n_intensity_bins)
        n_counted = counts.sum()

        self.intensity_bins = list(zip(bin_edges[:-1], bin_edges[1:]))
        self.intensity_probs = [count / n_counted for count in counts]
Ejemplo n.º 19
0
 def _get_distributions(self):
     """
     Estimate the m/z distribution from the MS1 scans of the file.

     Each peak's m/z is shifted by ``PROTON_MASS`` according to
     ``source_polarity`` (subtracted for POSITIVE, added for NEGATIVE;
     any other value is logged and the m/z is left unchanged). Peaks
     inside ``[min_mz, max_mz]`` are summed per unit-width m/z bin.
     Sets ``mz_bins`` and ``mz_probs``.
     """
     intensity_per_bin = {}
     for scan in MZMLFile(str(self.mzml_file_name)).scans:
         if scan.ms_level != 1:
             continue
         for mz, intensity in scan.peaks:
             if self.source_polarity == POSITIVE:
                 mz = mz - PROTON_MASS
             elif self.source_polarity == NEGATIVE:
                 mz = mz + PROTON_MASS
             else:
                 logger.warning("Unknown source polarity: {}".format(
                     self.source_polarity))
             if mz < self.min_mz or mz > self.max_mz:
                 continue
             bin_start = int(mz)
             intensity_per_bin[bin_start] = (
                 intensity_per_bin.get(bin_start, 0) + intensity)
     grand_total = sum(intensity_per_bin.values())
     self.mz_bins = [(start, start + 1) for start in intensity_per_bin]
     self.mz_probs = [
         total / grand_total for total in intensity_per_bin.values()
     ]
Ejemplo n.º 20
0
    print("ARGUMENTS:")
    for arg in vars(args):
        print(arg, getattr(args, arg))

    print()
    print()
    time_dict = {}
    if args.timing_file is None:
        print("No timing file provided")
        time_dict[1] = args.ms1_time
        time_dict[2] = args.ms2_time
        print("MS1 time = {}, MS2 time = {}".format(time_dict[1],
                                                    time_dict[2]))
    else:
        print("Extracting times from {}".format(args.timing_file))
        times = get_times(MZMLFile(args.timing_file))
        for ms_level in [1, 2]:
            time_dict[ms_level] = np.array(times[ms_level]).mean()
        print("MS1 time = {}, MS2 time = {}".format(time_dict[1],
                                                    time_dict[2]))

    if not args.time_factor == 1.:
        print("Changing times by factor {}".format(args.time_factor))
        for ms_level in time_dict:
            time_dict[ms_level] *= args.time_factor
        print("MS1 time = {}, MS2 time = {}".format(time_dict[1],
                                                    time_dict[2]))

    print()
    print()
    scan_levels, scan_start_times = setup_scans(time_dict, args.N, args.min_rt,
    def get_isotope_intensities(self, lipid_details, filepair, scan_delta=2):
        """
        Find the strongest signal for each isotope of a lipid in one file.

        The lightest target mass is searched in every scan inside the
        retention time window. Each heavier isotope is then searched only
        in the scans within ``scan_delta`` positions of the scan where the
        lightest mass was strongest.

        :param lipid_details: dict read for the keys 'retentionTime',
            'retentionTimeTolerance', 'formula', 'adduct', 'massTolerance',
            'massToleranceUnits' and 'isotopeDepth'
        :param filepair: sequence whose first element is passed to
            ``MZMLFile``; that element is replaced in place by the loaded
            file
        :param scan_delta: number of scans on each side of the best scan
            that are searched for the heavier isotopes
        :return: list of tuples ``(isotope_num, target_mass, max_intensity,
            observed_mass, retention_time, scan_no)``, starting with
            isotope 0
        """
        # NOTE: this replaces the caller's list element with the loaded
        # file; kept as-is because callers may read filepair[0] afterwards.
        filepair[0] = MZMLFile(filepair[0])

        rt_centre = lipid_details['retentionTime']
        rt_tolerance = lipid_details['retentionTimeTolerance']
        scans_in_range = [
            scan for scan in filepair[0].scans
            if (rt_centre - rt_tolerance <= scan.rt_in_seconds
                <= rt_centre + rt_tolerance)
        ]

        spectrum = Formula(lipid_details['formula']).spectrum()
        adduct = lipid_details['adduct']
        target_mass = sorted(
            mass2ion(x[0], adduct[2], adduct[1]) for x in spectrum.values())

        def mass_window(mass):
            # (low, high) search bounds around ``mass``; a ppm tolerance is
            # converted to an absolute one for this particular mass
            if lipid_details['massToleranceUnits'] == 'ppm':
                tolerance = self.ppm_to_da(mass,
                                           lipid_details['massTolerance'])
            else:
                tolerance = lipid_details['massTolerance']
            return mass - tolerance, mass + tolerance

        isotopes = []

        # lightest mass: search every scan in the retention time window
        current_mass = target_mass[0]
        low, high = mass_window(current_mass)
        max_intensity = 0
        max_intensity_index = 0
        max_mass = 0
        max_retention_time = 0
        max_scan_no = 0
        # enumerate gives the position directly instead of a linear
        # list.index() lookup on every improvement
        for scan_index, scan in enumerate(scans_in_range):
            intensity, exact_mass = self.get_max_mass(scan, low, high)
            if intensity >= max_intensity:
                max_intensity = intensity
                max_intensity_index = scan_index
                max_mass = exact_mass
                max_retention_time = scan.rt_in_seconds
                max_scan_no = scan.scan_no
        isotopes.append((0, current_mass, max_intensity, max_mass,
                         max_retention_time, max_scan_no))

        # heavier isotopes: search only around the best scan found above
        first_index = max(0, max_intensity_index - scan_delta)
        last_index = min(len(scans_in_range),
                         max_intensity_index + scan_delta + 1)
        for isotope_num, current_mass in enumerate(target_mass[1:], start=1):
            if isotope_num > lipid_details['isotopeDepth']:
                break
            max_intensity = 0
            max_mass = -1
            max_retention_time = None
            max_scan_no = None

            low, high = mass_window(current_mass)
            for scan in scans_in_range[first_index:last_index]:
                intensity, exact_mass = self.get_max_mass(scan, low, high)
                if intensity >= max_intensity:
                    max_intensity = intensity
                    max_mass = exact_mass
                    max_retention_time = scan.rt_in_seconds
                    max_scan_no = scan.scan_no
            isotopes.append((isotope_num, current_mass, max_intensity,
                             max_mass, max_retention_time, max_scan_no))
        return isotopes