def path_or_mzml(mzml):
    """
    Return an ``MZMLFile`` for the given argument.

    :param mzml: either an already-loaded ``MZMLFile`` (or subclass) instance,
        or any value accepted by the ``MZMLFile`` constructor
    :return: the ``MZMLFile`` instance itself if one was passed in, otherwise
        a new ``MZMLFile`` built from ``mzml``
    :raises NotImplementedError: if ``mzml`` is not an ``MZMLFile`` and the
        constructor fails on it; the underlying error is chained as the cause
    """
    # Check the type first so that an already-loaded object is never handed
    # to the constructor just to find out that it cannot be parsed.
    if isinstance(mzml, MZMLFile):
        return mzml
    try:
        return MZMLFile(mzml)
    except Exception as err:
        raise NotImplementedError("Didn't recognise the MZMLFile!") from err
def get_summary(mzml_file_path):
    """
    Build a dictionary of summary statistics for the scans of an mzML file.

    :param mzml_file_path: path string passed to ``MZMLFile``; its last
        ``os.sep``-separated component is reported as ``'FileName'``
    :return: a dict with the keys ``'First MS2'`` (index of the first scan
        whose level is not 1), ``'First MS2 block'`` (number of consecutive
        level-2 scans starting there), ``'StartRT'``, ``'EndRT'``,
        ``'Nscans'``, ``'Scans per sec'``, ``'FileName'``, ``'Nscans_MS1'``
        and ``'Nscans_MS2'``
    """
    scans = MZMLFile(mzml_file_path).scans
    n_scans = len(scans)

    # Walk past the leading MS1 scans to find where the first MS2 block starts
    first_ms2 = 0
    while first_ms2 < n_scans and scans[first_ms2].ms_level == 1:
        first_ms2 += 1

    # ... and then past the consecutive MS2 scans to find where it ends
    block_end = first_ms2
    while block_end < n_scans and scans[block_end].ms_level == 2:
        block_end += 1

    return {
        'First MS2': first_ms2,
        'First MS2 block': block_end - first_ms2,
        'StartRT': scans[0].rt_in_seconds,
        'EndRT': scans[-1].rt_in_seconds,
        'Nscans': n_scans,
        'Scans per sec': n_scans / (
            scans[-1].rt_in_seconds - scans[0].rt_in_seconds),
        'FileName': mzml_file_path.split(os.sep)[-1],
        'Nscans_MS1': sum(1 for s in scans if s.ms_level == 1),
        'Nscans_MS2': sum(1 for s in scans if s.ms_level == 2),
    }
def evaluate_mzml(mzml_file, picked_peaks_file, half_isolation_window):
    """
    Count the picked boxes that are mapped to scans of an mzML file.

    :param mzml_file: passed to ``MZMLFile`` to load the scans
    :param picked_peaks_file: passed to ``load_picked_boxes``
    :param half_isolation_window: forwarded to ``map_boxes_to_scans``
    :return: the number of entries in the boxes-to-scans mapping
    """
    picked_boxes = load_picked_boxes(picked_peaks_file)
    _, boxes2scans = map_boxes_to_scans(
        MZMLFile(mzml_file), picked_boxes,
        half_isolation_window=half_isolation_window)
    return len(boxes2scans)
def run_coverage_evaluation(box_file, mzml_file, half_isolation_window):
    """
    Compute the fraction of picked boxes that are mapped to scans.

    :param box_file: passed to ``load_picked_boxes``
    :param mzml_file: passed to ``MZMLFile`` to load the scans
    :param half_isolation_window: forwarded to ``map_boxes_to_scans``
    :return: number of entries in the boxes-to-scans mapping divided by the
        number of loaded boxes
    """
    picked_boxes = load_picked_boxes(box_file)
    _, boxes2scans = map_boxes_to_scans(
        MZMLFile(mzml_file), picked_boxes,
        half_isolation_window=half_isolation_window)
    n_matched = len(boxes2scans)
    return n_matched / len(picked_boxes)
def get_box_intensity(mzml_file, boxes):
    """
    Find, for every box, the most intense peak that falls inside it.

    Scans whose ``ms_level`` is 2 are skipped. A peak belongs to a box when
    the scan's retention time lies within the box's ``rt_range_in_seconds``
    and the peak's m/z lies within the box's ``mz_range`` (both inclusive).

    :param mzml_file: passed to ``MZMLFile`` to load the scans
    :param boxes: sized sequence of box objects exposing
        ``rt_range_in_seconds`` and ``mz_range``
    :return: a tuple ``(intensities, mzs)`` of lists aligned with ``boxes``;
        boxes that no peak falls into keep intensity 0 and m/z ``None``
    """
    n_boxes = len(boxes)
    intensities = [0] * n_boxes
    mzs = [None] * n_boxes

    for scan in MZMLFile(mzml_file).scans:
        if scan.ms_level == 2:
            continue
        rt = scan.rt_in_seconds

        # Narrow down to the boxes whose RT range contains this scan
        candidates = [
            (idx, box) for idx, box in enumerate(boxes)
            if box.rt_range_in_seconds[0] <= rt <= box.rt_range_in_seconds[1]
        ]
        if not candidates:
            continue

        for peak in scan.peaks:
            mz = peak[0]
            for idx, box in candidates:
                if not (box.mz_range[0] <= mz <= box.mz_range[1]):
                    continue
                intensity = peak[1]
                # Keep only the strongest peak seen so far for this box
                if intensity > intensities[idx]:
                    intensities[idx] = intensity
                    mzs[idx] = mz
    return intensities, mzs
def add_picked_peaks(self, mzml_file, peak_file, sample_name,
                     picking_method='mzmine', sample_type=None,
                     half_isolation_window=0, allow_last_overlap=False,
                     rt_shifts=None, mz_shifts=None):
    """
    Adds picked peak information for one sample to the aligner.

    The picked boxes are loaded from ``peak_file``, shifted, mapped to the
    scans of ``mzml_file`` and finally handed to ``self._align`` together
    with one ``Peak`` per box and the per-box precursor intensities.

    :param mzml_file: passed to ``MZMLFile`` to load the scans
    :param peak_file: passed to the box loader chosen by ``picking_method``
    :param sample_name: recorded in ``self.sample_names``; also used as the
        prefix of each peak's source id (``<sample_name>_<box index>``)
    :param picking_method: one of ``'mzmine'``, ``'peakonly'`` or ``'xcms'``;
        any other value terminates the process through ``sys.exit``
    :param sample_type: recorded in ``self.sample_types``
    :param half_isolation_window: forwarded to ``map_boxes_to_scans``
    :param allow_last_overlap: forwarded to ``map_boxes_to_scans``
    :param rt_shifts: forwarded to ``update_picked_boxes``
    :param mz_shifts: forwarded to ``update_picked_boxes``
    """
    self.sample_names.append(sample_name)
    self.sample_types.append(sample_type)

    # load boxes
    if picking_method == 'mzmine':
        loaded_boxes = load_picked_boxes(peak_file)
    elif picking_method == 'peakonly':
        loaded_boxes = load_peakonly_boxes(peak_file)  # not tested
    elif picking_method == 'xcms':
        loaded_boxes = load_xcms_boxes(peak_file)  # not tested
    else:
        sys.exit('Method not supported')
    picked_boxes = update_picked_boxes(loaded_boxes, rt_shifts, mz_shifts)
    self.list_of_boxes.append(picked_boxes)

    # Searching in boxes
    _, boxes2scans = map_boxes_to_scans(
        MZMLFile(mzml_file), picked_boxes,
        half_isolation_window=half_isolation_window,
        allow_last_overlap=allow_last_overlap)
    precursor_intensities, _ = get_precursor_intensities(
        boxes2scans, picked_boxes, 'max')

    # one Peak per box, with the matching precursor intensity alongside
    these_peaks = [
        Peak(box.mz, box.rt_in_seconds, box.height, sample_name,
             sample_name + '_' + str(idx))
        for idx, box in enumerate(picked_boxes)
    ]
    frag_intensities = [
        precursor_intensities[idx] for idx in range(len(these_peaks))
    ]

    # do alignment, adding the peaks and boxes, and recalculating
    # max frag intensity
    self._align(these_peaks, picked_boxes, frag_intensities, sample_name)
def test_get_max_mass(self):
    """Check ``get_max_mass`` on the first scan of a known test file."""
    os.chdir(self.test_files_folder)
    first_scan = MZMLFile("0_pp_d20_pos_1.mzML").scans[0]

    result = self.kinetics_obj.get_max_mass(first_scan, 0, 1000)
    max_i, max_mz = result

    # These values were manually read for this specific scan via TOPPView
    expected_intensity = 28188312.0
    expected_mz = 703.57514030
    self.assertEqual(max_i, expected_intensity)
    self.assertAlmostEqual(max_mz, expected_mz)
def load_scans_from_mzml(mzml_file_name):
    """
    Load the level-2 scans of an mzML file as ``Spectrum`` objects.

    :param mzml_file_name: passed to ``MZMLFile`` to load the scans
    :return: a dict mapping each level-2 scan's ``scan_no`` to a ``Spectrum``
        built from that scan's ``precursor_mz`` and ``peaks``
    """
    logger.debug("Loading scans from {}".format(mzml_file_name))
    all_scans = MZMLFile(mzml_file_name).scans
    return {
        scan.scan_no: Spectrum(scan.precursor_mz, scan.peaks)
        for scan in all_scans
        if scan.ms_level == 2
    }
def extract_timing(seed_file):
    """
    Extracts average scan durations from a seed file.

    The time between every pair of consecutive scans is grouped by the
    (current level, next level) transition. If both (1, 2) and (2, 2)
    transitions were observed, the file is treated as a fragmentation file
    and the MS1 time is the mean of the (1, 2) gaps while the MS2 time is
    the mean of the (2, 2) gaps. Otherwise only the MS1 time is produced,
    as the mean of the (1, 1) gaps.

    :param seed_file: the seed file in mzML format, passed to ``MZMLFile``
    :return: a dictionary keyed by ms-level (1, and 2 for fragmentation
        files) whose values are the mean gaps in seconds
    """
    logger.debug('Extracting timing dictionary from seed file')
    scans = MZMLFile(seed_file).scans

    time_dict = {(1, 1): [], (1, 2): [], (2, 1): [], (2, 2): []}
    for this_scan, next_scan in zip(scans, scans[1:]):
        transition = (this_scan.ms_level, next_scan.ms_level)
        elapsed = 60 * next_scan.rt_in_minutes - 60 * this_scan.rt_in_minutes
        time_dict[transition].append(elapsed)

    # seed_file must contain timing on (1,2) and (2,2) to count as a
    # DDA file with MS1 and MS2 scans
    is_frag_file = all(
        transition in time_dict and len(time_dict[transition]) > 0
        for transition in ((1, 2), (2, 2)))

    # construct timing dict in the right format for later use
    if is_frag_file:
        sources = ((1, (1, 2)), (2, (2, 2)))
    else:
        sources = ((1, (1, 1)),)

    new_time_dict = {}
    for key, transition in sources:
        gaps = time_dict[transition]
        mean = sum(gaps) / len(gaps)
        new_time_dict[key] = mean
        logger.debug('%d: %f' % (key, mean))

    if is_frag_file:
        assert 1 in new_time_dict and 2 in new_time_dict
    return new_time_dict
def main(mzml_file_name, msp_file_name, precursor_tolerance, hit_threshold):
    """
    Count how many library records are hit by the spectra of mzML file(s).

    :param mzml_file_name: path of a single mzML file, or of a directory
        whose ``*.mzML`` files are all loaded; if it is neither, the process
        exits with status 0
    :param msp_file_name: passed to ``library_from_msp`` to build the library
    :param precursor_tolerance: forwarded to ``spectral_match`` as ``ms1_tol``
    :param hit_threshold: forwarded to ``spectral_match`` as ``score_thresh``
    :return: a tuple ``(n_hits, n_library_ids)``: the number of distinct hit
        ids and the number of distinct record keys in the library
    """
    if os.path.isfile(mzml_file_name):
        paths = [mzml_file_name]
    elif os.path.isdir(mzml_file_name):
        paths = glob.glob(os.path.join(mzml_file_name, '*.mzML'))
    else:
        logger.debug("No mzML files found")
        sys.exit(0)

    mzml_file_objects = {}
    for path in paths:
        mzml_file_objects[path] = MZMLFile(path)

    for path, loaded in mzml_file_objects.items():
        logger.debug("Loaded {} scans from {}".format(
            len(loaded.scans), path))

    sl = library_from_msp(msp_file_name)
    logger.debug("Created library from {}".format(msp_file_name))

    # Collect the id (first element) of every hit over all query spectra
    hit_ids = set()
    for loaded in mzml_file_objects.values():
        for query in make_queries_from_mzml(loaded):
            hits = sl.spectral_match(query,
                                     ms1_tol=precursor_tolerance,
                                     score_thresh=hit_threshold)
            hit_ids.update(hit[0] for hit in hits)

    n_library_ids = len(set(sl.records.keys()))
    n_hits = len(hit_ids)
    logger.debug("Out of {} IDs, {} got hits".format(n_library_ids, n_hits))
    return n_hits, n_library_ids
def evaluate_boxes_as_dict(boxes, out_dir):
    """
    Count, for every mzML file in a folder, the boxes that map to scans.

    :param boxes: the boxes passed to ``map_boxes_to_scans``
    :param out_dir: folder that is searched for ``*.mzML`` files
    :return: a dict mapping each file's base name to the number of entries
        in its boxes-to-scans mapping (half isolation window fixed at 0)
    """
    counts = {}
    search_pattern = os.path.join(out_dir, '*.mzML')
    for mzml_path in glob.glob(search_pattern):
        name = os.path.basename(mzml_path)
        _, boxes2scans = map_boxes_to_scans(MZMLFile(mzml_path), boxes,
                                            half_isolation_window=0)
        n_found = len(boxes2scans)
        logger.info('- %s: found %d boxes with scans' % (name, n_found))
        counts[name] = n_found
    logger.debug(counts)
    return counts
def __init__(self, mzml_file, min_n_peaks=1, min_total_intensity=1e3,
             min_proportion=0.1, max_proportion=0.8,
             with_replacement=False):
    """
    Load an mzML file, store the settings and filter its scans.

    :param mzml_file: stored as ``mzml_file_name``; its ``str()`` form is
        passed to ``MZMLFile``
    :param min_n_peaks: stored on the instance (per the original comment,
        the minimum number of peaks an MS2 scan needs to be kept)
    :param min_total_intensity: stored on the instance (per the original
        comment, the minimum total intensity an MS2 scan needs to be kept)
    :param min_proportion: stored on the instance
    :param max_proportion: stored on the instance
    :param with_replacement: stored on the instance
    """
    self.mzml_file_name = mzml_file
    self.mzml_object = MZMLFile(str(mzml_file))

    # scan filtering thresholds
    self.min_n_peaks, self.min_total_intensity = (min_n_peaks,
                                                  min_total_intensity)
    # remaining settings
    self.with_replacement = with_replacement
    self.min_proportion, self.max_proportion = (min_proportion,
                                                max_proportion)

    # only keep MS2 scans that have at least min_n_peaks and
    # a total intensity of at least min_total_intensity
    self._filter_scans()
def evaluate_boxes_as_array(boxes, out_dir, yticks, xticks, pattern, params):
    """
    Count boxes mapped to scans over a grid of mzML files.

    One file name is generated per ``(y, x)`` combination with
    ``pattern.format(sample_name, y, x)`` and looked up inside ``out_dir``.

    :param boxes: the boxes passed to ``map_boxes_to_scans``
    :param out_dir: folder containing the mzML files
    :param yticks: values indexing the rows of the result
    :param xticks: values indexing the columns of the result
    :param pattern: format string taking the sample name, ``y`` and ``x``
    :param params: dict providing ``'sample_name'``
    :return: a ``len(yticks)`` by ``len(xticks)`` array holding the number
        of entries in each file's boxes-to-scans mapping, or NaN where a
        ``FileNotFoundError`` was raised
    """
    sample_name = params['sample_name']
    grid_shape = (len(yticks), len(xticks))
    counts = np.zeros(grid_shape)
    for row, y in enumerate(yticks):
        for col, x in enumerate(xticks):
            mzml_path = os.path.join(
                out_dir, pattern.format(sample_name, y, x))
            try:
                _, boxes2scans = map_boxes_to_scans(
                    MZMLFile(mzml_path), boxes, half_isolation_window=0)
            except FileNotFoundError:
                counts[row, col] = np.nan
            else:
                counts[row, col] = len(boxes2scans)
    logger.debug(counts)
    return counts
def main():
    """
    Command-line entry point that creates scan time plots.

    Takes a single mzML file or a folder (all ``*.mzML`` files in it) as the
    positional argument. For every file one figure with two rows of subplots
    is drawn, one column per key returned by ``get_times``: the top row shows
    a histogram of the second component of each entry, the bottom row plots
    the second component against the first. With ``--save_plots`` each figure
    is saved next to its input as ``<path>.png``; otherwise it is shown.
    """
    # NOTE(review): `rt` is declared global in the original, so unpacking
    # below rebinds a module-level name. Kept in case other code in the
    # module reads it -- confirm and drop if nothing does.
    global rt
    parser = argparse.ArgumentParser(description='Create scan time plots')
    parser.add_argument('file_or_folder', type=str)
    parser.add_argument('--save_plots', dest='save_plots',
                        action='store_true')
    args = parser.parse_args()

    if os.path.isdir(args.file_or_folder):
        print("Extracting mzml from folder")
        file_list = glob.glob(os.path.join(args.file_or_folder, '*.mzML'))
    else:
        print("Individual file")
        file_list = [args.file_or_folder]

    mzml_file_objects = {}
    timings = {}
    for mzml_file in file_list:
        mzml_file_objects[mzml_file] = MZMLFile(mzml_file)
        timings[mzml_file] = get_times(mzml_file_objects[mzml_file])

    # plot
    for mo, t in timings.items():
        nsp = len(t)  # number of subplots
        plt.figure(figsize=(20, 8))
        file_label = mo.split(os.sep)[-1]
        pos = 1

        # top row: histograms
        for k, v in t.items():
            plt.subplot(2, nsp, pos)
            plt.title(file_label + str(k))
            try:
                rt, de = zip(*v)
            except Exception:
                print("No data for " + str(k))
            else:
                # Only draw when unpacking succeeded; otherwise `de` would
                # be unbound or still hold the previous subplot's data.
                plt.hist(de)
            pos += 1

        # bottom row: second component plotted against the first
        for k, v in t.items():
            plt.subplot(2, nsp, pos)
            plt.title(file_label + str(k))
            try:
                rt, de = zip(*v)
                plt.plot(rt, de, 'ro')
            except Exception:
                print("No data for " + str(k))
            pos += 1

        if args.save_plots:
            plot_filename = mo + '.png'
            plt.savefig(plot_filename)
        else:
            plt.show()
def update_scores(self, aligner, experiment_ids, experiment_scores,
                  ms2_mzml):
    """
    Score the aligner's current boxes against an mzML file and refresh the
    stored per-experiment results.

    ``experiment_scores`` and ``experiment_ids`` are modified in place: the
    new experiment is appended, and every earlier experiment is re-indexed
    onto the current list of box ids.

    :param aligner: object providing ``peaksets`` and ``peaksets2boxes``
    :param experiment_ids: list with one list of box ids per experiment
    :param experiment_scores: list with one list of 0/1 scores per experiment
    :param ms2_mzml: passed to ``MZMLFile`` to load the scans
    :return: the tuple ``(experiment_scores, experiment_ids, boxes)``
    """
    # get boxes
    boxes = [
        aligner.peaksets2boxes[peakset] for peakset in aligner.peaksets
    ]
    # get box ids
    box_ids = [box.peak_id for box in boxes]
    # get box scores
    mz_file = MZMLFile(
        ms2_mzml)  # TODO: change this to take last of list of mzmls
    scans2boxes, boxes2scans = map_boxes_to_scans(
        mz_file, boxes, 0.75)  # TODO: add default parameter here
    # 1 if the box appears in the boxes-to-scans mapping, otherwise 0
    box_scores = [(box in boxes2scans) * 1 for box in boxes]
    # store results for experiment
    experiment_scores.append(box_scores)
    experiment_ids.append(box_ids)
    if len(experiment_scores) > 1:
        # update old experiment results: re-express each earlier experiment
        # in terms of the current box ids
        for j in range(len(experiment_scores) - 1):
            # note: this is the same list object as box_ids, not a copy
            updated_experiment_ids = box_ids
            # ids that the old experiment did not contain keep a score of 0
            updated_experiment_scores = np.array([0 for id in box_ids])
            for id in updated_experiment_ids:
                if id in experiment_ids[j]:
                    # first position of the id in the new and old id lists
                    where1 = np.where(
                        np.array(updated_experiment_ids) == id)[0][0]
                    where2 = np.where(
                        np.array(experiment_ids[j]) == id)[0][0]
                    updated_experiment_scores[where1] = np.array(
                        experiment_scores[j])[where2]
            updated_experiment_scores = list(updated_experiment_scores)
            experiment_scores[j] = updated_experiment_scores
            experiment_ids[j] = updated_experiment_ids
    # get cumulative fragmentation and score
    # NOTE(review): the loop below reads self.experiment_scores rather than
    # the experiment_scores parameter -- presumably callers pass the same
    # list; confirm, otherwise the two can disagree.
    cumulative_score = []
    for i in range(len(experiment_scores)):
        # element-wise maximum over the first i + 1 experiments
        updated_fragmentation = [
            max(x) for x in zip(*self.experiment_scores[:i + 1])
        ]
        cumulative_score.append(sum(updated_fragmentation))
    # after the loop this holds the maximum over all experiments
    self.cumulative_fragmentation = updated_fragmentation
    self.cumulative_score = cumulative_score
    # calculate updated intensities for each box
    self.box_intensities = self.calculate_box_intensities(
        self.completed_mzmls, boxes)
    return experiment_scores, experiment_ids, boxes
def create_scan_intensities(mzml_path, num_injection, schedule, mz_window):
    """
    Build one ``MatchingScan`` per scheduled scan, taking intensities from
    the level-1 scans of an mzML file.

    For a scheduled level-1 scan the intensities come from the result of
    ``MatchingScan.interpolate_scan`` on the two earliest remaining original
    scans, weighted by where the scheduled time lies between ``left_rt`` and
    ``right_rt``. Scheduled scans above level 1, and any scan scheduled when
    fewer than two original scans remain, get empty m/z and intensity lists.

    :param mzml_path: passed to ``MZMLFile`` to load the scans
    :param num_injection: forwarded to every ``MatchingScan``
    :param schedule: iterable of items that either expose ``ms_level`` and
        ``rt`` attributes or unpack into ``(ms_level, rt)``
    :param mz_window: forwarded to ``MatchingScan.interpolate_scan``
    :return: list of ``MatchingScan`` objects, one per schedule entry
    """
    new_scans = []
    # Sorted by descending RT so that the earliest scan is last and can be
    # discarded cheaply with pop().
    original_scans = sorted(
        (s for s in MZMLFile(mzml_path).scans if s.ms_level == 1),
        key=lambda s: s.rt_in_seconds,
        reverse=True)

    # With fewer than two level-1 scans there is nothing to interpolate
    # between. The loop below already emits empty scans in that case, so
    # the interpolation state is left unset instead of indexing [-2].
    left_rt = right_rt = mzs = intensities = owner = in_window = None
    if len(original_scans) > 1:
        left_rt, right_rt, mzs, intensities, owner, in_window = \
            MatchingScan.interpolate_scan(original_scans[-1],
                                          original_scans[-2], mz_window)

    for s in schedule:
        try:
            ms_level, rt = s.ms_level, s.rt
        except AttributeError:
            ms_level, rt = s

        if len(original_scans) > 1 and original_scans[-2].rt_in_seconds < rt:
            # Drop original scans until the second earliest one is no longer
            # before the scheduled time, then refresh the interpolation.
            while (len(original_scans) > 1
                   and original_scans[-2].rt_in_seconds < rt):
                original_scans.pop()
            if len(original_scans) > 1:
                left_rt, right_rt, mzs, intensities, owner, in_window = \
                    MatchingScan.interpolate_scan(original_scans[-1],
                                                  original_scans[-2],
                                                  mz_window)

        if ms_level > 1 or len(original_scans) < 2:
            new_scans.append(
                MatchingScan(num_injection, ms_level, rt, [], []))
        else:
            # Relative position of the scheduled time between the two scans
            w = (rt - left_rt) / (right_rt - left_rt)
            weighted_intensities = (owner * (1 - w) * intensities +
                                    (1 - owner) * w * intensities)
            # One summed intensity per (left_bound, right_bound) slice
            new_intensities = [
                np.sum(weighted_intensities[left_bound:right_bound])
                for (left_bound, right_bound) in in_window
            ]
            new_scans.append(
                MatchingScan(num_injection, ms_level, rt, mzs,
                             new_intensities))
    return new_scans
def _extract_timing(self, seed_file):
    """
    Extracts the raw timing information from a seed file.

    The time between every pair of consecutive scans is recorded under the
    (current ms-level, next ms-level) transition it belongs to.

    :param seed_file: the seed file in mzML format, passed to ``MZMLFile``
    :return: a dictionary whose keys are the transitions (1, 1), (1, 2),
        (2, 1) and (2, 2) and whose values are the lists of gaps between
        consecutive scans, in seconds; no averaging is done here
    """
    logger.debug('Extracting timing dictionary from seed file')
    scans = MZMLFile(seed_file).scans
    time_dict = {(1, 1): [], (1, 2): [], (2, 1): [], (2, 2): []}
    for this_scan, next_scan in zip(scans, scans[1:]):
        transition = (this_scan.ms_level, next_scan.ms_level)
        elapsed = 60 * next_scan.rt_in_minutes - 60 * this_scan.rt_in_minutes
        time_dict[transition].append(elapsed)
    return time_dict
def _get_distributions(self):
    """
    Estimate the retention time and log-intensity distributions of the file.

    Sets four attributes:

    - ``rt_bins`` / ``rt_probs``: one-second bins ``(k, k + 1)`` and the
      share of the summed level-1 intensity falling into each, using only
      scans with ``min_rt <= rt <= max_rt``
    - ``intensity_bins`` / ``intensity_probs``: histogram bins and
      normalised counts of the log of each ROI's maximum intensity,
      restricted to ``[min_log_intensity, max_log_intensity]``

    :raises ValueError: if scans were binned but their summed intensity is
        zero, so no retention time probabilities can be computed
    """
    mzml_file_object = MZMLFile(str(self.mzml_file_name))
    rt_bins = {}
    for scan in mzml_file_object.scans:
        if scan.ms_level != 1:
            continue
        rt = scan.rt_in_seconds
        if rt < self.min_rt or rt > self.max_rt:
            continue
        # Summing peak by peak copes with scans that contain no peaks,
        # which the previous zip(*scan.peaks) unpacking could not.
        scan_intensity = sum(peak[1] for peak in scan.peaks)
        rt_bin = int(rt)
        rt_bins[rt_bin] = rt_bins.get(rt_bin, 0) + scan_intensity

    total_intensity = sum(rt_bins.values())
    if rt_bins and total_intensity == 0:
        raise ValueError(
            'No MS1 intensity found in {}'.format(self.mzml_file_name))
    self.rt_bins = [(k, k + 1) for k in rt_bins]
    self.rt_probs = [v / total_intensity for v in rt_bins.values()]

    good = make_roi(str(self.mzml_file_name), self.roi_params)
    log_roi_intensities = [np.log(max(r.intensity_list)) for r in good]
    log_roi_intensities = [
        x for x in log_roi_intensities
        if self.min_log_intensity <= x <= self.max_log_intensity
    ]
    hist, bin_edges = np.histogram(log_roi_intensities,
                                   bins=self.n_intensity_bins)
    total_i = hist.sum()
    self.intensity_bins = list(zip(bin_edges[:-1], bin_edges[1:]))
    self.intensity_probs = [h / total_i for h in hist]
def _get_distributions(self):
    """
    Estimate the m/z distribution of the level-1 peaks in the file.

    Every peak's m/z is shifted by ``PROTON_MASS`` (subtracted for positive
    polarity, added for negative; left unchanged with a warning otherwise).
    Shifted values inside ``[min_mz, max_mz]`` are binned by their integer
    part and the peak intensities are accumulated per bin.

    Sets ``mz_bins`` to the list of ``(k, k + 1)`` bins and ``mz_probs`` to
    each bin's share of the total accumulated intensity.
    """
    mzml_file_object = MZMLFile(str(self.mzml_file_name))
    intensity_by_bin = {}
    for scan in mzml_file_object.scans:
        if scan.ms_level != 1:
            continue
        for shifted_mz, intensity in scan.peaks:
            if self.source_polarity == POSITIVE:
                shifted_mz = shifted_mz - PROTON_MASS
            elif self.source_polarity == NEGATIVE:
                shifted_mz = shifted_mz + PROTON_MASS
            else:
                logger.warning("Unknown source polarity: {}".format(
                    self.source_polarity))

            out_of_range = (shifted_mz < self.min_mz
                            or shifted_mz > self.max_mz)
            if out_of_range:
                continue

            bucket = int(shifted_mz)
            if bucket in intensity_by_bin:
                intensity_by_bin[bucket] += intensity
            else:
                intensity_by_bin[bucket] = intensity

    total_intensity = sum(intensity_by_bin.values())
    self.mz_bins = [(lower, lower + 1) for lower in intensity_by_bin]
    self.mz_probs = [
        amount / total_intensity for amount in intensity_by_bin.values()
    ]
print("ARGUMENTS:") for arg in vars(args): print(arg, getattr(args, arg)) print() print() time_dict = {} if args.timing_file is None: print("No timing file provided") time_dict[1] = args.ms1_time time_dict[2] = args.ms2_time print("MS1 time = {}, MS2 time = {}".format(time_dict[1], time_dict[2])) else: print("Extracting times from {}".format(args.timing_file)) times = get_times(MZMLFile(args.timing_file)) for ms_level in [1, 2]: time_dict[ms_level] = np.array(times[ms_level]).mean() print("MS1 time = {}, MS2 time = {}".format(time_dict[1], time_dict[2])) if not args.time_factor == 1.: print("Changing times by factor {}".format(args.time_factor)) for ms_level in time_dict: time_dict[ms_level] *= args.time_factor print("MS1 time = {}, MS2 time = {}".format(time_dict[1], time_dict[2])) print() print() scan_levels, scan_start_times = setup_scans(time_dict, args.N, args.min_rt,
def get_isotope_intensities(self, lipid_details, filepair, scan_delta=2):
    """
    Find the strongest signal of each isotope of a lipid in an mzML file.

    The lowest target mass is searched in every scan inside the retention
    time window; the scan with the highest intensity is remembered. Each
    further target mass (up to ``isotopeDepth`` of them) is then searched
    only in the scans within ``scan_delta`` positions of that scan.

    :param lipid_details: dict providing ``'retentionTime'``,
        ``'retentionTimeTolerance'``, ``'formula'``, ``'adduct'``,
        ``'massTolerance'``, ``'massToleranceUnits'`` and ``'isotopeDepth'``
    :param filepair: mutable sequence whose first element is passed to
        ``MZMLFile``; that element is replaced in place by the loaded object
    :param scan_delta: number of scans on either side of the best scan that
        are searched for the further isotopes
    :return: list of tuples ``(isotope number, target mass, max intensity,
        mass at max intensity, retention time, scan number)``
    """
    def _in_rt_window(scan):
        # inclusive retention time window around the expected time
        return (scan.rt_in_seconds >= lipid_details['retentionTime'] -
                lipid_details['retentionTimeTolerance']
                and scan.rt_in_seconds <= lipid_details['retentionTime'] +
                lipid_details['retentionTimeTolerance'])

    def _absolute_tolerance(mass):
        # ppm tolerances are converted relative to the mass being searched
        if lipid_details['massToleranceUnits'] == 'ppm':
            return self.ppm_to_da(mass, lipid_details['massTolerance'])
        return lipid_details['massTolerance']

    filepair[0] = MZMLFile(filepair[0])
    scans_in_range = [s for s in filepair[0].scans if _in_rt_window(s)]

    spectrum = Formula(lipid_details['formula']).spectrum()
    adduct = lipid_details['adduct']
    target_mass = sorted(
        mass2ion(x[0], adduct[2], adduct[1]) for x in spectrum.values())

    # First pass: lowest target mass over every scan in the window
    first_mass = target_mass[0]
    tolerance = _absolute_tolerance(first_mass)
    best_intensity = 0
    best_index = 0
    best_mass = 0
    best_rt = 0
    best_scan_no = 0
    for index, scan in enumerate(scans_in_range):
        intensity, exact_mass = self.get_max_mass(
            scan, first_mass - tolerance, first_mass + tolerance)
        if intensity >= best_intensity:
            best_intensity = intensity
            best_index = index
            best_mass = exact_mass
            best_rt = scan.rt_in_seconds
            best_scan_no = scan.scan_no
    isotopes = [(0, first_mass, best_intensity, best_mass, best_rt,
                 best_scan_no)]

    # Second pass: remaining target masses, only near the best scan
    first_index = max(0, best_index - scan_delta)
    stop_index = min(len(scans_in_range), best_index + scan_delta + 1)
    for isotope_num, mass in enumerate(target_mass[1:], start=1):
        if isotope_num > lipid_details['isotopeDepth']:
            break
        tolerance = _absolute_tolerance(mass)
        top_intensity = 0
        top_mass = -1
        top_rt = None
        top_scan_no = None
        for index in range(first_index, stop_index):
            scan = scans_in_range[index]
            intensity, exact_mass = self.get_max_mass(
                scan, mass - tolerance, mass + tolerance)
            if intensity >= top_intensity:
                top_intensity = intensity
                top_mass = exact_mass
                top_rt = scan.rt_in_seconds
                top_scan_no = scan.scan_no
        isotopes.append((isotope_num, mass, top_intensity, top_mass,
                         top_rt, top_scan_no))
    return isotopes