def test_multiple_adducts(self):
    """Check that no MS1 scan holds more peaks than chemicals x adducts.

    Five chemicals are sampled with three equally weighted positive-mode
    adducts and an adduct proportion cutoff of 0.0.  Each chemical's
    isotope list is overwritten with a single ``"Mono"`` entry, so the
    number of m/z values in any MS1 scan must not exceed
    ``n_chems * n_adducts``.
    """
    adduct_priors = {POSITIVE: {'M+H': 100, 'M+Na': 100, 'M+K': 100}}

    formula_sampler = DatabaseFormulaSampler(HMDB)
    rt_intensity_sampler = UniformRTAndIntensitySampler(min_rt=100, max_rt=101)
    chromatogram_sampler = ConstantChromatogramSampler()
    mixture_creator = ChemicalMixtureCreator(
        formula_sampler,
        rt_and_intensity_sampler=rt_intensity_sampler,
        chromatogram_sampler=chromatogram_sampler,
        adduct_prior_dict=adduct_priors,
        adduct_proportion_cutoff=0.0,
    )

    n_chems = 5
    # upper bound on peaks per scan: one peak per (chemical, adduct) pair
    max_peaks = n_chems * len(adduct_priors[POSITIVE])
    dataset = mixture_creator.sample(n_chems, 2)

    # collapse every chemical to a single isotope entry
    for chem in dataset:
        chem.isotopes = [(chem.mass, 1, "Mono")]

    # should be 15 peaks or less all the time
    # some adducts might not be sampled if the probability is less than 0.2
    controller = SimpleMs1Controller()
    mass_spec = IndependentMassSpectrometer(POSITIVE, dataset)
    env = Environment(mass_spec, controller, 102, 110, progress_bar=True)
    set_log_level_warning()
    env.run()

    for ms1_scan in controller.scans[1]:
        assert len(ms1_scan.mzs) <= max_peaks
def simple_dataset():
    """Sample N_CHEMS chemicals (2 MS levels) from HMDB with the M+H prior.

    Retention times are drawn uniformly between the bounds of the first
    entry of ``RT_RANGE``; adducts use ``ADDUCT_DICT_POS_MH``.

    Returns:
        The chemicals produced by ``ChemicalMixtureCreator.sample``.
    """
    rt_bounds = RT_RANGE[0]
    rt_intensity_sampler = UniformRTAndIntensitySampler(
        min_rt=rt_bounds[0],
        max_rt=rt_bounds[1],
    )
    formula_sampler = DatabaseFormulaSampler(HMDB)
    creator = ChemicalMixtureCreator(
        formula_sampler,
        rt_and_intensity_sampler=rt_intensity_sampler,
        adduct_prior_dict=ADDUCT_DICT_POS_MH,
    )
    return creator.sample(N_CHEMS, 2)
def test_hmdb_creation(self):
    """Sample chemicals from HMDB within the m/z and RT ranges and validate them.

    Formulas are restricted to the first entry of ``MZ_RANGE`` and
    retention times to the first entry of ``RT_RANGE``; the sampled
    chemicals are then passed to ``check_chems``.
    """
    rt_bounds = RT_RANGE[0]
    rt_intensity_sampler = UniformRTAndIntensitySampler(
        min_rt=rt_bounds[0],
        max_rt=rt_bounds[1],
    )
    mz_bounds = MZ_RANGE[0]
    formula_sampler = DatabaseFormulaSampler(
        HMDB,
        min_mz=mz_bounds[0],
        max_mz=mz_bounds[1],
    )
    creator = ChemicalMixtureCreator(
        formula_sampler,
        rt_and_intensity_sampler=rt_intensity_sampler,
    )
    check_chems(creator.sample(N_CHEMS, 2))
def test_ms2_mgf(self):
    """Sample chemicals whose MS2 sampler is backed by MGF_FILE and validate them.

    Uses the same HMDB formula sampler and uniform RT/intensity sampler
    bounds as the other creation tests, but supplies an
    ``MGFMS2Sampler`` as the ``ms2_sampler``.
    """
    mz_bounds = MZ_RANGE[0]
    formula_sampler = DatabaseFormulaSampler(
        HMDB,
        min_mz=mz_bounds[0],
        max_mz=mz_bounds[1],
    )
    rt_bounds = RT_RANGE[0]
    rt_intensity_sampler = UniformRTAndIntensitySampler(
        min_rt=rt_bounds[0],
        max_rt=rt_bounds[1],
    )
    ms2_sampler = MGFMS2Sampler(MGF_FILE)
    creator = ChemicalMixtureCreator(
        formula_sampler,
        rt_and_intensity_sampler=rt_intensity_sampler,
        ms2_sampler=ms2_sampler,
    )
    check_chems(creator.sample(N_CHEMS, 2))
def test_linked_ms1_ms2_creation(self):
    """Check that MS1 chemicals built from an MGF keep their matching MS2 peaks.

    A formula database is derived from ``MGF_FILE`` and paired with an
    ``ExactMatchMS2Sampler`` reading the same file.  Every sampled
    chemical must have at least one child, and exactly as many children
    as the MGF record with the same accession has peaks.
    """
    # the database and the MS2 sampler must be built from the same mgf file
    # and must agree on which MGF field is used as the unique ID
    id_field = "SPECTRUMID"

    # make a database from an mgf
    database = mgf_to_database(MGF_FILE, id_field=id_field)
    formula_sampler = DatabaseFormulaSampler(database)
    ms2_sampler = ExactMatchMS2Sampler(MGF_FILE, id_field=id_field)
    creator = ChemicalMixtureCreator(formula_sampler, ms2_sampler=ms2_sampler)
    dataset = creator.sample(N_CHEMS, 2)

    # check each chemical to see if it has the correct number of peaks
    records = load_mgf(MGF_FILE, id_field=id_field)
    for chemical in dataset:
        source_spectrum = records[chemical.database_accession]
        n_children = len(chemical.children)
        assert n_children > 0
        assert len(source_spectrum.peaks) == n_children
def generate_chems(cls, output_dir, no_chems):
    """Sample chemicals from HMDB and save them plus their RT range to disk.

    Chemicals are sampled at MS level 1 with an M+H-only positive adduct
    prior from formulas between 100 and 1000 m/z.  Two files are written
    into ``output_dir``: ``chems.pkl`` (the chemicals, via ``save_obj``)
    and ``rts.txt`` (``"<min_rt>,<max_rt>"``, where the bounds are the
    smallest and largest chemical RT scaled by 0.9 and 1.1).

    Args:
        output_dir: Directory to write into; it and any missing parent
            directories are created if needed.
        no_chems: Number of chemicals to sample.

    Returns:
        Tuple ``(chemicals, chems_path, rts_path)``.

    Raises:
        ValueError: If sampling yields no chemicals, since the RT range
            cannot be computed from an empty collection.
    """
    hmdb = load_obj(cls.c.HMDBPATH)
    formula_sampler = DatabaseFormulaSampler(hmdb, min_mz=100, max_mz=1000)
    creator = ChemicalMixtureCreator(
        formula_sampler,
        adduct_prior_dict={POSITIVE: {"M+H": 1}},
    )
    chemicals = creator.sample(no_chems, 1)

    # collect the RTs once instead of walking the chemicals twice
    rts = [chem.rt for chem in chemicals]
    min_rt, max_rt = min(rts) * 0.9, max(rts) * 1.1

    # parents=True so a nested output directory does not fail when its
    # parent is missing
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    chems_path = os.path.join(output_dir, "chems.pkl")
    rts_path = os.path.join(output_dir, "rts.txt")

    with open(rts_path, 'w') as rts_file:
        rts_file.write("{},{}".format(min_rt, max_rt))
    save_obj(chemicals, chems_path)

    return chemicals, chems_path, rts_path
def run_vimms(no_injections, rt_box_size, mz_box_size):
    """Simulate repeated injections with a NonOverlapController and collect ROI boxes.

    2000 chemicals are sampled once from the HMDB fixture (100-1000 m/z,
    M+H only) and re-used for every injection.  One grid estimator is
    shared by all controllers; each injection gets a fresh mass
    spectrometer and controller.

    Args:
        no_injections: Number of simulated injections to run.
        rt_box_size: RT box size passed to ``LocatorGrid``.
        mz_box_size: m/z box size passed to ``LocatorGrid``.

    Returns:
        A list with one entry per injection, each a list of boxes
        produced by ``to_box(0.01, 0.01)`` on that injection's ROIs.
    """
    min_rt, max_rt = 0, 1440
    ionisation_mode = POSITIVE
    isolation_width = 1
    N = 10
    rt_tol = 15
    mz_tol = 10
    min_ms1_intensity = 5000
    min_roi_intensity = 500
    min_roi_length = 3
    min_roi_length_for_fragmentation = 3

    locator = LocatorGrid(min_rt, max_rt, rt_box_size, 0, 3000, mz_box_size)
    grid = GridEstimator(locator, IdentityDrift())

    # fixture path is resolved relative to the current working directory
    hmdbpath = os.path.join(
        os.path.abspath(os.getcwd()),
        "..", "..", "tests", "fixtures", "hmdb_compounds.p",
    )
    hmdb = load_obj(hmdbpath)
    formula_sampler = DatabaseFormulaSampler(hmdb, min_mz=100, max_mz=1000)
    creator = ChemicalMixtureCreator(
        formula_sampler,
        adduct_prior_dict={POSITIVE: {"M+H": 1}},
    )
    chemicals = creator.sample(2000, 1)

    boxes = []
    for _ in range(no_injections):
        mass_spec = IndependentMassSpectrometer(
            ionisation_mode,
            chemicals,
            mz_noise=GaussianPeakNoise(0.1),
        )
        controller = NonOverlapController(
            ionisation_mode,
            isolation_width,
            mz_tol,
            min_ms1_intensity,
            min_roi_intensity,
            min_roi_length,
            N,
            grid,
            rt_tol=rt_tol,
            min_roi_length_for_fragmentation=min_roi_length_for_fragmentation,
        )
        env = Environment(mass_spec, controller, min_rt, max_rt,
                          progress_bar=True)
        set_log_level_warning()
        env.run()

        rois = controller.roi_builder.get_rois()
        boxes.append([roi.to_box(0.01, 0.01) for roi in rois])
    return boxes
def test_multiple_chems(self):
    """Exercise MultipleMixtureCreator with three 'case' group settings.

    Scenarios, all without intensity noise:

    1. missing and changing probability 0 -- every list passes
       ``check_chems``, contains all base chemicals, and keeps their
       intensities.
    2. missing probability 1 -- every 'case' list is empty.
    3. changing probability 1 -- every chemical in a 'case' list has an
       intensity different from its base chemical.
    """
    mz_bounds = MZ_RANGE[0]
    rt_bounds = RT_RANGE[0]
    formula_sampler = DatabaseFormulaSampler(HMDB, min_mz=mz_bounds[0],
                                             max_mz=mz_bounds[1])
    rt_intensity_sampler = UniformRTAndIntensitySampler(min_rt=rt_bounds[0],
                                                        max_rt=rt_bounds[1])
    creator = ChemicalMixtureCreator(
        formula_sampler, rt_and_intensity_sampler=rt_intensity_sampler)
    base_chems = creator.sample(N_CHEMS, 2)

    group_list = ['control', 'control', 'case', 'case']
    # no intensity noise, so intensity differences can only come from
    # the group settings under test
    peak_noise = NoPeakNoise()

    def generate_lists(missing_probability, changing_probability):
        """Build the chemical lists for one setting of the 'case' group."""
        group_dict = {
            'case': {
                'missing_probability': missing_probability,
                'changing_probability': changing_probability,
            }
        }
        mixture = MultipleMixtureCreator(base_chems, group_list, group_dict,
                                         intensity_noise=peak_noise)
        return mixture.generate_chemical_lists()

    def case_lists(chemical_lists):
        """Keep only the lists whose group label is 'case'.

        Assumes the lists line up one-to-one with ``group_list``.
        """
        return [chems for group, chems in zip(group_list, chemical_lists)
                if group == 'case']

    # with these settings all chemicals should be in all lists with
    # identical intensities
    for chems in generate_lists(0, 0):
        check_chems(chems)
        originals = [f.base_chemical for f in chems]
        assert len(set(originals)) == len(base_chems)
        for f in chems:
            assert f.max_intensity == f.base_chemical.max_intensity

    # test the case that if the missing probability is 1 all are missing
    for chems in case_lists(generate_lists(1., 0)):
        assert len(chems) == 0

    # test the case that changing probability is 1 changes everything
    for chems in case_lists(generate_lists(0., 1.)):
        for f in chems:
            assert f.max_intensity != f.base_chemical.max_intensity
# Register the last two command-line options, then parse the arguments.
parser.add_argument('--output_swath_file', dest='output_swath_file', type=str, default=None)
parser.add_argument('--print_chems', dest='print_chems', action='store_true')
args = parser.parse_args()

# Load the formula database from the path given on the command line.
formula_database = load_obj(args.formula_database_file)
logger.debug("Loaded {} formulas".format(len(formula_database)))

# Formula sampler restricted to the requested m/z window.
fs = DatabaseFormulaSampler(formula_database, min_mz=args.min_mz, max_mz=args.max_mz)
# The intensity bounds are given on the linear scale on the command line;
# the sampler takes them as natural logs, hence np.log.
ri = UniformRTAndIntensitySampler(
    min_rt=args.min_rt, max_rt=args.max_rt,
    min_log_intensity=np.log(args.min_ms1_sampling_intensity),
    max_log_intensity=np.log(args.max_ms1_sampling_intensity))
cs = UniformMS2Sampler()
# Combine the samplers, using the M+H positive-mode adduct prior.
cm = ChemicalMixtureCreator(fs, rt_and_intensity_sampler=ri, ms2_sampler=cs, adduct_prior_dict=ADDUCT_DICT_POS_MH)
# Sample the requested number of chemicals at the requested MS levels.
dataset = cm.sample(args.n_chems, args.ms_levels)