def test_negative_fixed(self):
    """Run a TopN controller in negative mode and check the observed m/z values.

    The first MS1 scan must contain exactly the expected integer m/z values,
    and every MS2 peak must truncate to one of the expected fragment values.
    """
    formula_sampler = EvenMZFormulaSampler()
    ms2_sampler = FixedMS2Sampler()
    rt_intensity_sampler = UniformRTAndIntensitySampler(min_rt=100, max_rt=101)
    chromatogram_sampler = ConstantChromatogramSampler()
    creator = ChemicalMixtureCreator(
        formula_sampler,
        ms2_sampler=ms2_sampler,
        rt_and_intensity_sampler=rt_intensity_sampler,
        chromatogram_sampler=chromatogram_sampler)
    dataset = creator.sample(3, 2)

    N = 10
    isolation_width = 0.7
    mz_tol = 10
    rt_tol = 15

    mass_spec = IndependentMassSpectrometer(NEGATIVE, dataset)
    controller = TopNController(NEGATIVE, N, isolation_width, mz_tol, rt_tol,
                                MIN_MS1_INTENSITY)
    env = Environment(mass_spec, controller, 102, 110, progress_bar=True)
    set_log_level_warning()
    env.run()

    # Compare the whole list: an element-wise loop over the observed values
    # would pass silently if the scan contained fewer peaks than expected.
    ms1_mz_vals = [int(m) for m in controller.scans[1][0].mzs]
    expected_vals = [98, 198, 298]
    assert ms1_mz_vals == expected_vals

    expected_frags = {88, 78, 188, 178, 288, 278}
    for scan in controller.scans[2]:
        for m in scan.mzs:
            assert int(m) in expected_frags
def test_multiple_adducts(self):
    """Check that no MS1 scan holds more peaks than chemicals times adducts."""
    adduct_prior_dict = {POSITIVE: {'M+H': 100, 'M+Na': 100, 'M+K': 100}}
    n_adducts = len(adduct_prior_dict[POSITIVE])
    n_chems = 5

    creator = ChemicalMixtureCreator(
        DatabaseFormulaSampler(HMDB),
        rt_and_intensity_sampler=UniformRTAndIntensitySampler(min_rt=100,
                                                              max_rt=101),
        chromatogram_sampler=ConstantChromatogramSampler(),
        adduct_prior_dict=adduct_prior_dict,
        adduct_proportion_cutoff=0.0)
    dataset = creator.sample(n_chems, 2)

    # keep a single isotope entry per chemical
    for chem in dataset:
        chem.isotopes = [(chem.mass, 1, "Mono")]

    # should be 15 peaks or less all the time;
    # some adducts might not be sampled if the probability is less than 0.2
    max_peaks = n_chems * n_adducts

    controller = SimpleMs1Controller()
    mass_spec = IndependentMassSpectrometer(POSITIVE, dataset)
    env = Environment(mass_spec, controller, 102, 110, progress_bar=True)
    set_log_level_warning()
    env.run()

    for scan in controller.scans[1]:
        assert len(scan.mzs) <= max_peaks
def ten_chems():
    """Return ``sample(10, 2)`` from a seeded creator with uniform m/z and Gaussian chromatograms."""
    # fix both random generators so the sampled dataset is reproducible
    np.random.seed(0)
    rand.seed(0)

    creator = ChemicalMixtureCreator(
        UniformMZFormulaSampler(min_mz=MZ_RANGE[0][0], max_mz=MZ_RANGE[0][1]),
        rt_and_intensity_sampler=UniformRTAndIntensitySampler(min_rt=200,
                                                              max_rt=300),
        chromatogram_sampler=GaussianChromatogramSampler())
    return creator.sample(10, 2)
def simple_dataset():
    """Return ``sample(1, 2)`` from a seeded creator restricted to m/z 515-516."""
    # fix both random generators so the sampled dataset is reproducible
    np.random.seed(0)
    rand.seed(0)

    creator = ChemicalMixtureCreator(
        UniformMZFormulaSampler(min_mz=515, max_mz=516),
        rt_and_intensity_sampler=UniformRTAndIntensitySampler(min_rt=150,
                                                              max_rt=160),
        chromatogram_sampler=GaussianChromatogramSampler(sigma=100))
    return creator.sample(1, 2)
def test_rt_from_mzml(self):
    """Smoke test: sampling with samplers built from MZML_FILE must not raise."""
    rt_intensity_sampler = MZMLRTandIntensitySampler(MZML_FILE)
    formula_sampler = MZMLFormulaSampler(MZML_FILE)
    chromatogram_sampler = MZMLChromatogramSampler(MZML_FILE)
    creator = ChemicalMixtureCreator(
        formula_sampler,
        rt_and_intensity_sampler=rt_intensity_sampler,
        chromatogram_sampler=chromatogram_sampler)
    # no assertions: the result is discarded
    creator.sample(100, 2)
def two_fixed_chems():
    """Return ``sample(2, 2)`` from a seeded creator with even m/z and constant chromatograms."""
    # fix both random generators so the sampled dataset is reproducible
    np.random.seed(0)
    rand.seed(0)

    creator = ChemicalMixtureCreator(
        EvenMZFormulaSampler(),
        rt_and_intensity_sampler=UniformRTAndIntensitySampler(min_rt=100,
                                                              max_rt=101),
        chromatogram_sampler=ConstantChromatogramSampler())
    return creator.sample(2, 2)
def chem_mz_rt_i_from_mzml():
    """Return ``sample(500, 2)`` from a seeded creator whose samplers are all built from MZML_FILE."""
    # fix both random generators so the sampled dataset is reproducible
    np.random.seed(0)
    rand.seed(0)

    creator = ChemicalMixtureCreator(
        MZMLFormulaSampler(MZML_FILE),
        rt_and_intensity_sampler=MZMLRTandIntensitySampler(MZML_FILE),
        chromatogram_sampler=MZMLChromatogramSampler(MZML_FILE))
    return creator.sample(500, 2)
def simple_no_database_dataset():
    """Return ``sample(N_CHEMS, 2)`` using a default uniform m/z sampler and ADDUCT_DICT_POS_MH."""
    rt_intensity_sampler = UniformRTAndIntensitySampler(
        min_rt=RT_RANGE[0][0], max_rt=RT_RANGE[0][1])
    formula_sampler = UniformMZFormulaSampler()
    creator = ChemicalMixtureCreator(
        formula_sampler,
        rt_and_intensity_sampler=rt_intensity_sampler,
        adduct_prior_dict=ADDUCT_DICT_POS_MH)
    return creator.sample(N_CHEMS, 2)
def test_mzml_ms2(self):
    """Check every sampled chemical has at least ``min_n_peaks`` children."""
    min_n_peaks = 50
    ms2_sampler = MZMLMS2Sampler(MZML_FILE, min_n_peaks=min_n_peaks)
    formula_sampler = UniformMZFormulaSampler()
    creator = ChemicalMixtureCreator(formula_sampler, ms2_sampler=ms2_sampler)

    chems = creator.sample(N_CHEMS, 2)
    for chem in chems:
        assert len(chem.children) >= min_n_peaks
def test_mz_creation(self):
    """Sample chemicals within MZ_RANGE / RT_RANGE and validate them with ``check_chems``."""
    rt_intensity_sampler = UniformRTAndIntensitySampler(
        min_rt=RT_RANGE[0][0], max_rt=RT_RANGE[0][1])
    formula_sampler = UniformMZFormulaSampler(
        min_mz=MZ_RANGE[0][0], max_mz=MZ_RANGE[0][1])
    creator = ChemicalMixtureCreator(
        formula_sampler, rt_and_intensity_sampler=rt_intensity_sampler)
    check_chems(creator.sample(N_CHEMS, 2))
def even_chems():
    """Return ``sample(4, 2)`` from a seeded creator with even m/z and ADDUCT_DICT_POS_MH."""
    # fix both random generators so the sampled dataset is reproducible
    np.random.seed(0)
    rand.seed(0)

    # four evenly spaced chems for more advanced SWATH testing
    creator = ChemicalMixtureCreator(
        EvenMZFormulaSampler(),
        rt_and_intensity_sampler=UniformRTAndIntensitySampler(min_rt=100,
                                                              max_rt=101),
        chromatogram_sampler=ConstantChromatogramSampler(),
        adduct_prior_dict=ADDUCT_DICT_POS_MH)
    return creator.sample(4, 2)
def test_ms2_mgf(self):
    """Sample chemicals with an MGF-backed MS2 sampler and validate them with ``check_chems``."""
    formula_sampler = DatabaseFormulaSampler(
        HMDB, min_mz=MZ_RANGE[0][0], max_mz=MZ_RANGE[0][1])
    rt_intensity_sampler = UniformRTAndIntensitySampler(
        min_rt=RT_RANGE[0][0], max_rt=RT_RANGE[0][1])
    ms2_sampler = MGFMS2Sampler(MGF_FILE)
    creator = ChemicalMixtureCreator(
        formula_sampler,
        rt_and_intensity_sampler=rt_intensity_sampler,
        ms2_sampler=ms2_sampler)
    check_chems(creator.sample(N_CHEMS, 2))
def fullscan_dataset():
    """Return ``sample(N_CHEMS, 1)`` from a seeded creator spanning MZ_RANGE and RT_RANGE."""
    # fix both random generators so the sampled dataset is reproducible
    np.random.seed(0)
    rand.seed(0)

    formula_sampler = UniformMZFormulaSampler(
        min_mz=MZ_RANGE[0][0], max_mz=MZ_RANGE[0][1])
    rt_intensity_sampler = UniformRTAndIntensitySampler(
        min_rt=RT_RANGE[0][0], max_rt=RT_RANGE[0][1])
    chromatogram_sampler = GaussianChromatogramSampler(sigma=100)
    creator = ChemicalMixtureCreator(
        formula_sampler,
        rt_and_intensity_sampler=rt_intensity_sampler,
        chromatogram_sampler=chromatogram_sampler)
    return creator.sample(N_CHEMS, 1)
def test_TopNDEW_agent(self):
    """Run three consecutive simulations that share one TopNDEWAgent.

    Each run uses a fresh controller, mass spectrometer and environment.
    MS1 scans are checked after every run; MS2 scans only after the first two.
    """
    set_log_level_debug()
    formula_sampler = UniformMZFormulaSampler()
    rt_intensity_sampler = UniformRTAndIntensitySampler(min_rt=0, max_rt=80)
    chromatogram_sampler = GaussianChromatogramSampler(sigma=1)
    ms2_sampler = FixedMS2Sampler()
    creator = ChemicalMixtureCreator(
        formula_sampler,
        rt_and_intensity_sampler=rt_intensity_sampler,
        chromatogram_sampler=chromatogram_sampler,
        ms2_sampler=ms2_sampler)
    dataset = creator.sample(500, 2)
    ionisation_mode = POSITIVE

    # Example shows how the same Agent object can be used in consecutive controllers
    agent = TopNDEWAgent(ionisation_mode, 10, 0.7, 10, 15, 1500)
    spike_noise = UniformSpikeNoise(0.1, 1000)

    # (output file, whether MS2 scans are checked for this run)
    runs = [
        ('shell.mzML', True),
        ('shell2.mzML', True),
        ('shell3.mzML', False),  # ms2 scans have been exhausted at this point
    ]
    for out_file, expect_ms2 in runs:
        controller = AgentBasedController(agent)
        mass_spec = IndependentMassSpectrometer(ionisation_mode, dataset,
                                                spike_noise=spike_noise)
        env = Environment(mass_spec, controller, 0, 100, progress_bar=True)
        set_log_level_warning()
        env.run()

        check_non_empty_MS1(controller)
        if expect_ms2:
            check_non_empty_MS2(controller)
        check_mzML(env, OUT_DIR, out_file)
def generate_chems(cls, output_dir, no_chems):
    """Sample chemicals from the HMDB database and save them to ``output_dir``.

    Two files are written: ``rts.txt`` containing ``min_rt,max_rt`` (the
    smallest chemical RT scaled by 0.9 and the largest scaled by 1.1), and
    ``chems.pkl`` containing the sampled chemicals.

    Args:
        output_dir: directory to write into; created, including any missing
            parent directories, if it does not exist.
        no_chems: number of chemicals to sample.

    Returns:
        A tuple ``(chemicals, path to chems.pkl, path to rts.txt)``.
    """
    hmdb = load_obj(cls.c.HMDBPATH)
    df = DatabaseFormulaSampler(hmdb, min_mz=100, max_mz=1000)
    cm = ChemicalMixtureCreator(df, adduct_prior_dict={POSITIVE: {"M+H": 1}})
    chemicals = cm.sample(no_chems, 1)

    min_rt = min(chem.rt for chem in chemicals) * 0.9
    max_rt = max(chem.rt for chem in chemicals) * 1.1

    # parents=True: a nested output directory whose parents are missing
    # would otherwise raise FileNotFoundError
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    rts_path = os.path.join(output_dir, "rts.txt")
    chems_path = os.path.join(output_dir, "chems.pkl")

    with open(rts_path, 'w') as rts:
        rts.write("{},{}".format(min_rt, max_rt))
    save_obj(chemicals, chems_path)

    return chemicals, chems_path, rts_path
def test_linked_ms1_ms2_creation(self):
    """Check that each sampled chemical has as many children as its MGF record has peaks."""
    id_field = "SPECTRUMID"

    # make a database from an mgf
    database = mgf_to_database(MGF_FILE, id_field=id_field)
    formula_sampler = DatabaseFormulaSampler(database)

    # ExactMatchMS2Sampler needs to be given the same mgf file
    # and both need to use the same field in the MGF as the unique ID
    ms2_sampler = ExactMatchMS2Sampler(MGF_FILE, id_field=id_field)
    creator = ChemicalMixtureCreator(formula_sampler, ms2_sampler=ms2_sampler)
    dataset = creator.sample(N_CHEMS, 2)

    # check each chemical to see if it has the correct number of peaks
    records = load_mgf(MGF_FILE, id_field=id_field)
    for chem in dataset:
        source_spectrum = records[chem.database_accession]
        n_children = len(chem.children)
        assert n_children > 0
        assert len(source_spectrum.peaks) == n_children
def test_aif_with_fixed_chems(self):
    """Run the AIF controller on one fixed chemical and check alternating scan contents."""
    formula_sampler = EvenMZFormulaSampler()
    ms2_sampler = FixedMS2Sampler(n_frags=2)
    chromatogram_sampler = ConstantChromatogramSampler()
    rt_intensity_sampler = UniformRTAndIntensitySampler(min_rt=0, max_rt=1)
    creator = ChemicalMixtureCreator(
        formula_sampler,
        ms2_sampler=ms2_sampler,
        chromatogram_sampler=chromatogram_sampler,
        rt_and_intensity_sampler=rt_intensity_sampler)
    dataset = creator.sample(1, 2)

    ms1_source_cid_energy = 30
    controller = AIF(ms1_source_cid_energy)
    ionisation_mode = POSITIVE
    mass_spec = IndependentMassSpectrometer(ionisation_mode, dataset)
    env = Environment(mass_spec, controller, 10, 20, progress_bar=True)
    set_log_level_warning()
    env.run()

    for scan_idx, scan in enumerate(controller.scans[1]):
        if scan_idx % 2 == 0:
            # even scan, MS1 - should have a single peak at integer value of 101
            integer_mzs = [int(mz) for mz in scan.mzs]
            assert integer_mzs[0] == 101
        else:
            # odd scan, AIF, should have two peaks at 81 and 91
            integer_mzs = sorted(int(mz) for mz in scan.mzs)
            assert integer_mzs[0] == 81
            assert integer_mzs[1] == 91
def test_exclusion_simple_data(self):
    """Check that an exclusion list carried between runs suppresses later fragmentation."""
    # three chemicals, all of which get fragmented the first time around;
    # the exclusion list is then carried over so that none of them
    # should be fragmented in the later runs
    formula_sampler = EvenMZFormulaSampler()
    chromatogram_sampler = ConstantChromatogramSampler()
    rt_intensity_sampler = UniformRTAndIntensitySampler(min_rt=0, max_rt=5)
    creator = ChemicalMixtureCreator(
        formula_sampler,
        chromatogram_sampler=chromatogram_sampler,
        rt_and_intensity_sampler=rt_intensity_sampler)
    n_chems = 3
    dataset = creator.sample(n_chems, 2)

    ionisation_mode = POSITIVE
    exclusion_items = []
    min_ms1_intensity = 0
    N = 10
    mz_tol = 10
    rt_tol = 30
    isolation_width = 1

    all_controllers = []
    for _ in range(3):
        mass_spec = IndependentMassSpectrometer(ionisation_mode, dataset)
        controller = TopNController(
            ionisation_mode, N, isolation_width, mz_tol, rt_tol,
            min_ms1_intensity, initial_exclusion_list=exclusion_items)
        env = Environment(mass_spec, controller, 0, 20, progress_bar=True)
        run_environment(env)

        mz_intervals = list(
            controller.exclusion.exclusion_list.boxes_mz.items())
        rt_intervals = list(
            controller.exclusion.exclusion_list.boxes_rt.items())
        unique_items_mz = {interval.data for interval in mz_intervals}
        unique_items_rt = {interval.data for interval in rt_intervals}
        assert len(unique_items_mz) == len(unique_items_rt)

        # feed the accumulated exclusion items into the next run
        exclusion_items = list(unique_items_mz)
        all_controllers.append(controller)

    first_run, second_run, third_run = (all_controllers[0],
                                        all_controllers[1],
                                        all_controllers[2])
    assert len(first_run.scans[2]) == n_chems
    assert len(second_run.scans[2]) == 0
    assert len(third_run.scans[2]) == 0
def test_targeted(self):
    """Check the targeted controller acquires ``n_replicates`` scans per target and collision energy."""
    formula_sampler = EvenMZFormulaSampler()
    rt_intensity_sampler = UniformRTAndIntensitySampler(min_rt=0, max_rt=10)
    chromatogram_sampler = ConstantChromatogramSampler()
    ms2_sampler = FixedMS2Sampler()
    creator = ChemicalMixtureCreator(
        formula_sampler,
        rt_and_intensity_sampler=rt_intensity_sampler,
        chromatogram_sampler=chromatogram_sampler,
        ms2_sampler=ms2_sampler)
    dataset = creator.sample(2, 2)  # sample chems with m/z = 100 and 200
    ionisation_mode = POSITIVE

    targets = [
        Target(101, 100, 102, 10, 20, adduct='M+H'),
        Target(201, 200, 202, 10, 20, metadata={'a': 1}),
    ]
    ce_values = [10, 20, 30]
    n_replicates = 4

    controller = TargetedController(targets, ce_values,
                                    n_replicates=n_replicates,
                                    limit_acquisition=True)
    mass_spec = IndependentMassSpectrometer(ionisation_mode, dataset)
    env = Environment(mass_spec, controller, 5, 25, progress_bar=True)
    set_log_level_warning()
    env.run()

    # check that we got scans at every MS level present
    for ms_level in controller.scans:
        assert len(controller.scans[ms_level]) > 0
    set_log_level_debug()

    # count MS2 scans per (target, collision energy)
    target_counts = {t: {c: 0 for c in ce_values} for t in targets}
    for scan in controller.scans[2]:
        params = scan.scan_params
        pmz = params.get(ScanParameters.PRECURSOR_MZ)[0].precursor_mz
        matching = [
            t for t in targets
            if (t.from_rt <= scan.rt <= t.to_rt)
            and (t.from_mz <= pmz <= t.to_mz)
        ]
        # every MS2 scan must fall inside exactly one target window
        assert len(matching) == 1
        ce = params.get(ScanParameters.COLLISION_ENERGY)
        target_counts[matching[0]][ce] += 1

    for per_ce_counts in target_counts.values():
        for count in per_ce_counts.values():
            assert count == n_replicates
def test_multiple_isolation(self):
    """Check multi-isolation MS2 scans are the union of the single-precursor scans."""
    N = 3
    formula_sampler = EvenMZFormulaSampler()
    rt_intensity_sampler = UniformRTAndIntensitySampler(min_rt=0, max_rt=10)
    chromatogram_sampler = ConstantChromatogramSampler()
    ms2_sampler = FixedMS2Sampler()
    creator = ChemicalMixtureCreator(
        formula_sampler,
        rt_and_intensity_sampler=rt_intensity_sampler,
        chromatogram_sampler=chromatogram_sampler,
        ms2_sampler=ms2_sampler)
    dataset = creator.sample(3, 2)

    controller = MultiIsolationController(N)
    mass_spec = IndependentMassSpectrometer(POSITIVE, dataset)
    env = Environment(mass_spec, controller, 10, 20, progress_bar=True)
    set_log_level_warning()
    env.run()

    assert len(controller.scans[1]) > 0
    assert len(controller.scans[2]) > 0

    # look at the first block of MS2 scans
    # and check that they are the correct super-positions
    precursor_sets = [
        (0, ), (1, ), (2, ),     # first three scans hit the individual precursors
        (0, 1), (0, 2), (1, 2),  # next three should hit the pairs
        (0, 1, 2),               # final should hit all three
    ]
    ms2_scans = controller.scans[2]
    scan_by_precursors = {
        key: ms2_scans[pos] for pos, key in enumerate(precursor_sets)
    }

    for key, scan in scan_by_precursors.items():
        actual_mz_vals = set(scan.mzs)
        expected_mz_vals = {
            mz for k in key for mz in scan_by_precursors[(k, )].mzs
        }
        assert expected_mz_vals == actual_mz_vals
def run_vimms(no_injections, rt_box_size, mz_box_size):
    """Simulate repeated injections with a NonOverlapController and collect ROI boxes.

    Args:
        no_injections: number of simulated injections to run.
        rt_box_size: RT box size passed to the LocatorGrid.
        mz_box_size: m/z box size passed to the LocatorGrid.

    Returns:
        A list with one entry per injection, each a list of
        ``roi.to_box(0.01, 0.01)`` for the ROIs of that injection's controller.
    """
    rt_range = [(0, 1440)]
    min_rt, max_rt = rt_range[0]

    ionisation_mode = POSITIVE
    isolation_width = 1
    N = 10
    rt_tol = 15
    mz_tol = 10
    min_ms1_intensity = 5000
    min_roi_intensity = 500
    min_roi_length = 3
    min_roi_length_for_fragmentation = 3

    # a single grid instance is shared by the controllers of all injections
    locator = LocatorGrid(min_rt, max_rt, rt_box_size, 0, 3000, mz_box_size)
    grid = GridEstimator(locator, IdentityDrift())

    # the database path is resolved relative to the current working directory
    hmdbpath = os.path.join(os.path.abspath(os.getcwd()), "..", "..",
                            "tests", "fixtures", "hmdb_compounds.p")
    hmdb = load_obj(hmdbpath)
    formula_sampler = DatabaseFormulaSampler(hmdb, min_mz=100, max_mz=1000)
    creator = ChemicalMixtureCreator(
        formula_sampler, adduct_prior_dict={POSITIVE: {"M+H": 1}})
    chemicals = creator.sample(2000, 1)

    boxes = []
    for _ in range(no_injections):
        mz_noise = GaussianPeakNoise(0.1)
        mass_spec = IndependentMassSpectrometer(POSITIVE, chemicals,
                                                mz_noise=mz_noise)
        controller = NonOverlapController(
            ionisation_mode, isolation_width, mz_tol, min_ms1_intensity,
            min_roi_intensity, min_roi_length, N, grid, rt_tol=rt_tol,
            min_roi_length_for_fragmentation=min_roi_length_for_fragmentation)
        env = Environment(mass_spec, controller, min_rt, max_rt,
                          progress_bar=True)
        set_log_level_warning()
        env.run()

        injection_boxes = []
        for roi in controller.roi_builder.get_rois():
            injection_boxes.append(roi.to_box(0.01, 0.01))
        boxes.append(injection_boxes)
    return boxes
logger.debug("Loaded {} formulas".format(len(formula_database))) fs = DatabaseFormulaSampler(formula_database, min_mz=args.min_mz, max_mz=args.max_mz) ri = UniformRTAndIntensitySampler( min_rt=args.min_rt, max_rt=args.max_rt, min_log_intensity=np.log(args.min_ms1_sampling_intensity), max_log_intensity=np.log(args.max_ms1_sampling_intensity)) cs = UniformMS2Sampler() cm = ChemicalMixtureCreator(fs, rt_and_intensity_sampler=ri, ms2_sampler=cs, adduct_prior_dict=ADDUCT_DICT_POS_MH) dataset = cm.sample(args.n_chems, args.ms_levels) if args.print_chems: logger.debug("Sampled chems") for chem in dataset: logger.debug(chem) if args.output_msp_file is not None: write_msp(dataset, args.output_msp_file) spike_noise = UniformSpikeNoise(0.01, args.spike_max) ms = IndependentMassSpectrometer(POSITIVE_IONISATION_MODE,
def test_ms2_matching(self):
    """Simulate four samples with carried-over exclusion, then compare MS2 matching results.

    The matching run on the first mzML file alone is compared with the run
    over the whole output folder.
    """
    rt_intensity_sampler = UniformRTAndIntensitySampler(min_rt=10, max_rt=20)
    formula_sampler = UniformMZFormulaSampler()
    adduct_prior_dict = {POSITIVE: {'M+H': 1}}
    creator = ChemicalMixtureCreator(
        formula_sampler,
        rt_and_intensity_sampler=rt_intensity_sampler,
        adduct_prior_dict=adduct_prior_dict)
    base_chems = creator.sample(300, 2)

    group_list = ['control', 'control', 'case', 'case']
    group_dict = {
        'control': {
            'missing_probability': 0.0,
            'changing_probability': 0.0
        },
        'case': {
            'missing_probability': 0.0,
            'changing_probability': 1.0
        },
    }
    mixture_creator = MultipleMixtureCreator(base_chems, group_list,
                                             group_dict)
    chem_lists = mixture_creator.generate_chemical_lists()

    N = 10
    isolation_width = 0.7
    mz_tol = 0.001
    rt_tol = 30
    min_ms1_intensity = 0

    set_log_level_warning()

    output_folder = os.path.join(OUT_DIR, 'ms2_matching')
    write_msp(base_chems, 'mmm.msp', out_dir=output_folder)

    exclusion_items = []
    for idx, chem_list in enumerate(chem_lists):
        controller = TopNController(
            POSITIVE, N, isolation_width, mz_tol, rt_tol, min_ms1_intensity,
            initial_exclusion_list=exclusion_items)
        mass_spec = IndependentMassSpectrometer(POSITIVE, chem_list)
        env = Environment(mass_spec, controller, 10, 30, progress_bar=True)
        env.run()
        env.write_mzML(output_folder, '{}.mzML'.format(idx))

        mz_intervals = list(
            controller.exclusion.exclusion_list.boxes_mz.items())
        rt_intervals = list(
            controller.exclusion.exclusion_list.boxes_rt.items())
        unique_items_mz = {interval.data for interval in mz_intervals}
        unique_items_rt = {interval.data for interval in rt_intervals}
        assert len(unique_items_mz) == len(unique_items_rt)

        # carry the exclusion items over to the next sample
        exclusion_items = list(unique_items_mz)
        logger.warning(len(exclusion_items))

    set_log_level_debug()
    msp_file = os.path.join(output_folder, 'mmm.msp')

    # check with just the first file
    first_file = os.path.join(output_folder, '0.mzML')
    single_first, single_second = ms2_main(first_file, msp_file, 1, 0.7)

    # check with all
    all_first, all_second = ms2_main(
        output_folder, os.path.join(output_folder, 'mmm.msp'), 1, 0.7)

    assert single_second == all_second
    assert all_first > single_first
def test_multiple_chems(self):
    """Check MultipleMixtureCreator under different missing/changing probabilities."""
    formula_sampler = DatabaseFormulaSampler(
        HMDB, min_mz=MZ_RANGE[0][0], max_mz=MZ_RANGE[0][1])
    rt_intensity_sampler = UniformRTAndIntensitySampler(
        min_rt=RT_RANGE[0][0], max_rt=RT_RANGE[0][1])
    creator = ChemicalMixtureCreator(
        formula_sampler, rt_and_intensity_sampler=rt_intensity_sampler)
    base_chems = creator.sample(N_CHEMS, 2)

    group_list = ['control', 'control', 'case', 'case']
    peak_noise = NoPeakNoise()

    def chemical_lists_for(case_settings):
        """Generate chemical lists with the given settings for the 'case' group."""
        mixture_creator = MultipleMixtureCreator(
            base_chems, group_list, {'case': case_settings},
            intensity_noise=peak_noise)
        return mixture_creator.generate_chemical_lists()

    # nothing missing, nothing changing
    chem_lists = chemical_lists_for({
        'missing_probability': 0,
        'changing_probability': 0
    })
    for chem_list in chem_lists:
        check_chems(chem_list)
        # with these settings all chemicals should be in all lists with identical intensities
        originals = [chem.base_chemical for chem in chem_list]
        assert len(set(originals)) == len(base_chems)
        for chem in chem_list:
            assert chem.max_intensity == chem.base_chemical.max_intensity

    # the missing-probability scenario is exercised twice, as in the original test
    for _ in range(2):
        # test the case that if the missing probability is 1 all are missing
        chem_lists = chemical_lists_for({
            'missing_probability': 1.,
            'changing_probability': 0
        })
        for group, chem_list in zip(group_list, chem_lists):
            if group == 'case':
                assert len(chem_list) == 0

    # test the case that changing probability is 1 changes everything
    chem_lists = chemical_lists_for({
        'missing_probability': 0.,
        'changing_probability': 1.
    })
    for group, chem_list in zip(group_list, chem_lists):
        if group == 'case':
            for chem in chem_list:
                assert chem.max_intensity != chem.base_chemical.max_intensity