Ejemplo n.º 1
0
    def test_multiple_adducts(self):
        """Simulate MS1 scans of a mixture with three positive adducts.

        Every MS1 scan must contain at most ``n_chems * n_adducts`` peaks
        once each chemical is reduced to a single isotope.
        """
        formula_sampler = DatabaseFormulaSampler(HMDB)
        rt_intensity_sampler = UniformRTAndIntensitySampler(min_rt=100,
                                                            max_rt=101)
        chromatogram_sampler = ConstantChromatogramSampler()
        positive_adduct_priors = {'M+H': 100, 'M+Na': 100, 'M+K': 100}
        adduct_prior_dict = {POSITIVE: positive_adduct_priors}
        mixture_creator = ChemicalMixtureCreator(
            formula_sampler,
            rt_and_intensity_sampler=rt_intensity_sampler,
            chromatogram_sampler=chromatogram_sampler,
            adduct_prior_dict=adduct_prior_dict,
            adduct_proportion_cutoff=0.0)

        n_adducts = len(adduct_prior_dict[POSITIVE])
        n_chems = 5
        dataset = mixture_creator.sample(n_chems, 2)

        # Collapse every chemical to one isotope entry so that the peak
        # count depends only on the number of chemicals and adducts.
        for chem in dataset:
            chem.isotopes = [(chem.mass, 1, "Mono")]

        # Upper bound: one peak per (chemical, adduct) pair, i.e. 5 * 3 = 15.
        # Fewer peaks are acceptable, so the check below is "<=".
        max_peaks_per_scan = n_chems * n_adducts

        controller = SimpleMs1Controller()
        mass_spec = IndependentMassSpectrometer(POSITIVE, dataset)
        env = Environment(mass_spec, controller, 102, 110, progress_bar=True)
        set_log_level_warning()
        env.run()

        ms1_scans = controller.scans[1]
        for scan in ms1_scans:
            assert len(scan.mzs) <= max_peaks_per_scan
Ejemplo n.º 2
0
def simple_dataset():
    """Sample ``N_CHEMS`` chemicals from HMDB using the M+H-only adduct prior.

    Retention times are drawn uniformly from the first interval in
    ``RT_RANGE``. Chemicals are sampled with ``2`` as the second argument
    to ``ChemicalMixtureCreator.sample``.

    Returns:
        Whatever ``ChemicalMixtureCreator.sample`` returns for these settings.
    """
    rt_bounds = RT_RANGE[0]
    rt_intensity_sampler = UniformRTAndIntensitySampler(min_rt=rt_bounds[0],
                                                        max_rt=rt_bounds[1])
    formula_sampler = DatabaseFormulaSampler(HMDB)
    mixture_creator = ChemicalMixtureCreator(
        formula_sampler,
        rt_and_intensity_sampler=rt_intensity_sampler,
        adduct_prior_dict=ADDUCT_DICT_POS_MH)
    return mixture_creator.sample(N_CHEMS, 2)
Ejemplo n.º 3
0
    def test_hmdb_creation(self):
        """Sample chemicals from HMDB within the configured m/z and RT ranges.

        The sampled chemicals are validated with ``check_chems``.
        """
        rt_bounds = RT_RANGE[0]
        rt_intensity_sampler = UniformRTAndIntensitySampler(
            min_rt=rt_bounds[0], max_rt=rt_bounds[1])

        mz_bounds = MZ_RANGE[0]
        formula_sampler = DatabaseFormulaSampler(
            HMDB, min_mz=mz_bounds[0], max_mz=mz_bounds[1])

        mixture_creator = ChemicalMixtureCreator(
            formula_sampler, rt_and_intensity_sampler=rt_intensity_sampler)
        chemicals = mixture_creator.sample(N_CHEMS, 2)

        check_chems(chemicals)
Ejemplo n.º 4
0
 def test_ms2_mgf(self):
     """Sample chemicals whose MS2 data comes from an MGF-backed sampler.

     Formulas come from HMDB restricted to the first ``MZ_RANGE`` interval,
     retention times from the first ``RT_RANGE`` interval, and the MS2
     sampler is built from ``MGF_FILE``. The result is validated with
     ``check_chems``.
     """
     mz_bounds = MZ_RANGE[0]
     formula_sampler = DatabaseFormulaSampler(
         HMDB, min_mz=mz_bounds[0], max_mz=mz_bounds[1])

     rt_bounds = RT_RANGE[0]
     rt_intensity_sampler = UniformRTAndIntensitySampler(
         min_rt=rt_bounds[0], max_rt=rt_bounds[1])

     ms2_sampler = MGFMS2Sampler(MGF_FILE)
     mixture_creator = ChemicalMixtureCreator(
         formula_sampler,
         rt_and_intensity_sampler=rt_intensity_sampler,
         ms2_sampler=ms2_sampler)

     check_chems(mixture_creator.sample(N_CHEMS, 2))
Ejemplo n.º 5
0
    def test_linked_ms1_ms2_creation(self):
        """Check that chemicals built from an MGF keep their own MS2 peaks.

        A formula database is derived from ``MGF_FILE`` and paired with an
        ``ExactMatchMS2Sampler`` reading the same file. Each sampled chemical
        must have as many children as its source MGF record has peaks.
        """
        # The database, the MS2 sampler and the reference records must all
        # key on the same MGF field so that accessions line up.
        id_field = "SPECTRUMID"

        database = mgf_to_database(MGF_FILE, id_field=id_field)
        formula_sampler = DatabaseFormulaSampler(database)
        ms2_sampler = ExactMatchMS2Sampler(MGF_FILE, id_field=id_field)
        mixture_creator = ChemicalMixtureCreator(formula_sampler,
                                                 ms2_sampler=ms2_sampler)
        dataset = mixture_creator.sample(N_CHEMS, 2)

        # Compare each chemical against the MGF record it was sampled from.
        records = load_mgf(MGF_FILE, id_field=id_field)
        for chem in dataset:
            source_spectrum = records[chem.database_accession]
            n_children = len(chem.children)
            assert n_children > 0
            assert len(source_spectrum.peaks) == n_children
Ejemplo n.º 6
0
 def generate_chems(cls, output_dir, no_chems):
     """Sample chemicals and store them, with a padded RT range, on disk.

     Formulas are drawn from the database loaded from ``cls.c.HMDBPATH``
     (m/z 100-1000) with an M+H-only positive adduct prior.

     Args:
         output_dir: Directory for the output files; created if missing
             (its parent must already exist).
         no_chems: Number of chemicals to sample.

     Returns:
         A tuple ``(chemicals, chems_path, rts_path)`` where ``chems_path``
         is the saved ``chems.pkl`` and ``rts_path`` is ``rts.txt`` holding
         ``"<min_rt>,<max_rt>"``.
     """
     formula_sampler = DatabaseFormulaSampler(load_obj(cls.c.HMDBPATH),
                                              min_mz=100,
                                              max_mz=1000)
     mixture_creator = ChemicalMixtureCreator(
         formula_sampler, adduct_prior_dict={POSITIVE: {"M+H": 1}})
     chemicals = mixture_creator.sample(no_chems, 1)

     # Widen the observed RT span: 0.9 x smallest RT and 1.1 x largest RT.
     rt_values = [chem.rt for chem in chemicals]
     min_rt = min(rt_values) * 0.9
     max_rt = max(rt_values) * 1.1

     Path(output_dir).mkdir(exist_ok=True)
     rts_path = os.path.join(output_dir, "rts.txt")
     chems_path = os.path.join(output_dir, "chems.pkl")

     with open(rts_path, 'w') as rts_file:
         rts_file.write("{},{}".format(min_rt, max_rt))
     save_obj(chemicals, chems_path)

     return chemicals, chems_path, rts_path
Ejemplo n.º 7
0
def run_vimms(no_injections, rt_box_size, mz_box_size):
    """Simulate repeated injections with a NonOverlapController.

    The same 2000 sampled chemicals are used for every injection, and a
    single grid estimator is shared by all controllers.

    Args:
        no_injections: Number of simulated injections to run.
        rt_box_size: Grid box size along the RT axis.
        mz_box_size: Grid box size along the m/z axis.

    Returns:
        A list with one entry per injection, each a list of boxes built
        from that injection's ROIs via ``to_box(0.01, 0.01)``.
    """
    # Acquisition window.
    rt_range = [(0, 1440)]
    min_rt, max_rt = rt_range[0]

    # Controller settings.
    ionisation_mode = POSITIVE
    isolation_width = 1
    N = 10
    rt_tol = 15
    mz_tol = 10
    min_ms1_intensity = 5000
    min_roi_intensity = 500
    min_roi_length = 3
    min_roi_length_for_fragmentation = 3

    locator = LocatorGrid(min_rt, max_rt, rt_box_size, 0, 3000, mz_box_size)
    grid = GridEstimator(locator, IdentityDrift())

    # The HMDB fixture is located relative to the current working directory.
    cwd = os.path.abspath(os.getcwd())
    hmdbpath = os.path.join(cwd, "..", "..", "tests", "fixtures",
                            "hmdb_compounds.p")
    formula_sampler = DatabaseFormulaSampler(load_obj(hmdbpath),
                                             min_mz=100,
                                             max_mz=1000)
    mixture_creator = ChemicalMixtureCreator(
        formula_sampler, adduct_prior_dict={POSITIVE: {"M+H": 1}})
    chemicals = mixture_creator.sample(2000, 1)

    boxes = []
    for _ in range(no_injections):
        mass_spec = IndependentMassSpectrometer(
            POSITIVE, chemicals, mz_noise=GaussianPeakNoise(0.1))
        controller = NonOverlapController(
            ionisation_mode,
            isolation_width,
            mz_tol,
            min_ms1_intensity,
            min_roi_intensity,
            min_roi_length,
            N,
            grid,
            rt_tol=rt_tol,
            min_roi_length_for_fragmentation=min_roi_length_for_fragmentation)
        env = Environment(mass_spec, controller, min_rt, max_rt,
                          progress_bar=True)
        set_log_level_warning()
        env.run()

        injection_boxes = []
        for roi in controller.roi_builder.get_rois():
            injection_boxes.append(roi.to_box(0.01, 0.01))
        boxes.append(injection_boxes)
    return boxes
Ejemplo n.º 8
0
    def test_multiple_chems(self):
        """Exercise MultipleMixtureCreator with extreme 'case' settings.

        Four samples are generated (two 'control', two 'case') from one base
        set of chemicals, with no intensity noise. Three configurations of
        the 'case' group are checked:

        * missing 0 / changing 0: every list holds all chemicals with
          unchanged intensities;
        * missing 1 / changing 0: the 'case' lists are empty;
        * missing 0 / changing 1: every 'case' chemical's intensity differs
          from its base chemical.
        """
        hf = DatabaseFormulaSampler(HMDB,
                                    min_mz=MZ_RANGE[0][0],
                                    max_mz=MZ_RANGE[0][1])
        ri = UniformRTAndIntensitySampler(min_rt=RT_RANGE[0][0],
                                          max_rt=RT_RANGE[0][1])
        cc = ChemicalMixtureCreator(hf, rt_and_intensity_sampler=ri)
        d = cc.sample(N_CHEMS, 2)

        group_list = ['control', 'control', 'case', 'case']

        # No intensity noise, so any intensity difference seen below comes
        # from the group settings alone.
        peak_noise = NoPeakNoise()

        def generate_lists(missing_probability, changing_probability):
            # Build the per-sample chemical lists for one 'case' setting.
            group_dict = {
                'case': {
                    'missing_probability': missing_probability,
                    'changing_probability': changing_probability
                }
            }
            mm = MultipleMixtureCreator(d,
                                        group_list,
                                        group_dict,
                                        intensity_noise=peak_noise)
            return mm.generate_chemical_lists()

        def case_lists(chemical_lists):
            # Keep only the lists whose sample belongs to the 'case' group;
            # lists are matched to group_list by position.
            return [
                chems for group, chems in zip(group_list, chemical_lists)
                if group == 'case'
            ]

        # With these settings all chemicals should be in all lists with
        # identical intensities.
        for c in generate_lists(0, 0):
            check_chems(c)
            originals = {f.base_chemical for f in c}
            assert len(originals) == len(d)
            for f in c:
                assert f.max_intensity == f.base_chemical.max_intensity

        # If the missing probability is 1, all 'case' chemicals are missing.
        for c in case_lists(generate_lists(1., 0)):
            assert len(c) == 0

        # If the changing probability is 1, every 'case' intensity changes.
        for c in case_lists(generate_lists(0., 1.)):
            for f in c:
                assert f.max_intensity != f.base_chemical.max_intensity
Ejemplo n.º 9
0
    parser.add_argument('--output_swath_file',
                        dest='output_swath_file',
                        type=str,
                        default=None)
    parser.add_argument('--print_chems',
                        dest='print_chems',
                        action='store_true')

    args = parser.parse_args()

    formula_database = load_obj(args.formula_database_file)

    logger.debug("Loaded {} formulas".format(len(formula_database)))

    fs = DatabaseFormulaSampler(formula_database,
                                min_mz=args.min_mz,
                                max_mz=args.max_mz)

    ri = UniformRTAndIntensitySampler(
        min_rt=args.min_rt,
        max_rt=args.max_rt,
        min_log_intensity=np.log(args.min_ms1_sampling_intensity),
        max_log_intensity=np.log(args.max_ms1_sampling_intensity))
    cs = UniformMS2Sampler()

    cm = ChemicalMixtureCreator(fs,
                                rt_and_intensity_sampler=ri,
                                ms2_sampler=cs,
                                adduct_prior_dict=ADDUCT_DICT_POS_MH)

    dataset = cm.sample(args.n_chems, args.ms_levels)