def test_search(mock_fetch_formulas, spark_context, ds_config):
    """Run ``MSMSearch.search`` on mocked inputs and check the first result's size.

    The first yielded result must contain one metrics row and one ion image per
    (formula, adduct) combination, and its FDR bundle must pass
    ``sanity_check_fdr_diagnostics``.
    """
    formulas = ['H2O', 'C5H3O']
    adducts = ds_config['isotope_generation']['adducts']
    expected_n = len(formulas) * len(adducts)

    def fake_fetch_formulas(moldb_id):
        # Every molecular database resolves to the same fixed formula list.
        return formulas

    mock_fetch_formulas.side_effect = fake_fetch_formulas

    def process_segments_locally(centr_segm_n, func):
        # Apply ``func`` to each segment index and hand the results to Spark.
        return spark_context.parallelize(map(func, range(centr_segm_n)))

    with TemporaryDirectory() as tmpdir:
        ds_data_path = Path(tmpdir)
        moldbs = [MolecularDB(0, 'tests_db', 'version', targeted=True)]
        msm_search = MSMSearch(
            spark_context,
            make_imzml_reader_mock(),
            moldbs,
            ds_config,
            ds_data_path,
            NullProfiler(),
        )
        msm_search._fetch_formula_centroids = make_fetch_formula_centroids_mock()
        msm_search.process_segments = process_segments_locally

        # Only the first result yielded by the search is inspected.
        first_result = next(msm_search.search())
        moldb_ion_metrics_df, moldb_ion_images_rdd, fdr_bundle = first_result

        assert len(moldb_ion_metrics_df) == expected_n
        assert moldb_ion_images_rdd.count() == expected_n
        sanity_check_fdr_diagnostics(fdr_bundle)
def test_init_fdr(self, fetch_formulas_mock):
    """``init_fdr`` returns a single entry for a single database, with a non-empty ``td_df``."""
    fdr_settings = {'decoy_sample_size': 20}
    ds_config = {
        'analysis_version': 1,
        'fdr': fdr_settings,
        'isotope_generation': BASIC_ISOTOPE_GENERATION_CONFIG,
    }
    moldbs = [MolecularDB(0, 'test_db', 'version')]

    moldb_fdr_list = init_fdr(ds_config, moldbs)

    assert len(moldb_fdr_list) == 1
    # Each entry is a pair; only its second element (the FDR object) is checked.
    _first, fdr = moldb_fdr_list[0]
    assert not fdr.td_df.empty
def test_compute_fdr_and_filter_results(targeted, exp_annot_n, spark_context):
    """``compute_fdr_and_filter_results`` keeps exactly ``exp_annot_n`` annotations.

    The metrics frame and the image RDD must both contain ``exp_annot_n`` entries,
    the diagnostics ``metrics_df`` must hold at least that many rows, and the FDR
    bundle must pass ``sanity_check_fdr_diagnostics``.
    """
    moldb = MolecularDB(0, 'test_db', 'version', targeted=targeted)

    search_results = make_search_results(spark_context)
    fdr, ion_formula_map_df, formula_metrics_df, formula_images_rdd = search_results

    filtered = compute_fdr_and_filter_results(
        moldb,
        fdr,
        ion_formula_map_df,
        formula_metrics_df,
        formula_images_rdd,
        None,
    )
    moldb_ion_metrics_df, moldb_ion_images_rdd, fdr_bundle = filtered

    n_metric_rows = moldb_ion_metrics_df.shape[0]
    assert n_metric_rows == exp_annot_n

    n_images = moldb_ion_images_rdd.count()
    assert n_images == exp_annot_n

    assert len(fdr_bundle['metrics_df']) >= exp_annot_n
    sanity_check_fdr_diagnostics(fdr_bundle)
def test_compute_fdr(spark_context, ds_config):
    """``compute_fdr`` returns one row per input metric with the expected set of columns."""
    moldbs = [MolecularDB(0, 'test_db', 'version')]
    moldb_fdr_list = init_fdr(ds_config, moldbs)
    _first, fdr = moldb_fdr_list[0]

    # The formula map is passed to compute_fdr without its 'moldb_id' column.
    ion_formulas_df = collect_ion_formulas(spark_context, moldb_fdr_list)
    formula_map_df = ion_formulas_df.drop('moldb_id', axis=1)

    metric_rows = [
        (10, 'H3O', 0.99),
        (11, 'C5H4O', 0.5),
        (12, 'H2ONa', 0.1),
    ]
    metric_columns = ['formula_i', 'ion_formula', 'msm']
    formula_metrics_df = pd.DataFrame(metric_rows, columns=metric_columns)
    formula_metrics_df = formula_metrics_df.set_index('formula_i')

    metrics_df = compute_fdr(fdr, formula_metrics_df, formula_map_df, None)

    assert len(metrics_df) == 3
    # Column order is irrelevant here, so compare against the alphabetically sorted names.
    expected_columns = ['fdr', 'formula', 'ion_formula', 'modifier', 'msm']
    assert sorted(metrics_df.columns.tolist()) == expected_columns
def test_decoy_sample_size_30(self, fetch_formulas_mock, spark_context):
    """``collect_ion_formulas`` output layout and size for ``decoy_sample_size`` = 30."""
    fdr_settings = {'decoy_sample_size': 30}
    ds_config = {
        'analysis_version': 1,
        'fdr': fdr_settings,
        'isotope_generation': BASIC_ISOTOPE_GENERATION_CONFIG,
    }
    moldbs = [MolecularDB(0, 'test_db', 'version')]

    moldb_fdr_list = init_fdr(ds_config, moldbs)
    df = collect_ion_formulas(spark_context, moldb_fdr_list)

    expected_columns = ['moldb_id', 'ion_formula', 'formula', 'modifier']
    assert df.columns.tolist() == expected_columns
    # NOTE(review): 62 rows presumably = 2 formulas * (1 target adduct + 30 decoys);
    # confirm against BASIC_ISOTOPE_GENERATION_CONFIG and the mocked formula list.
    assert df.shape == (62, 4)
def test_neutral_losses_and_chem_mods(self, fetch_formulas_mock, spark_context):
    """``collect_ion_formulas`` output size with neutral losses and chem mods enabled."""
    fdr_settings = {'decoy_sample_size': 1}
    ds_config = {
        'analysis_version': 1,
        'fdr': fdr_settings,
        'isotope_generation': FULL_ISOTOPE_GENERATION_CONFIG,
    }
    moldbs = [MolecularDB(0, 'test_db', 'version')]

    moldb_fdr_list = init_fdr(ds_config, moldbs)
    df = collect_ion_formulas(spark_context, moldb_fdr_list)

    expected_columns = ['moldb_id', 'ion_formula', 'formula', 'modifier']
    assert df.columns.tolist() == expected_columns
    # 2 formulas
    #   * (4 target adducts + (4 target adducts * 1 decoy adduct per target adduct))
    #   * (no loss + 2 neutral losses)
    #   * (no mod + 1 chem mod)
    # = 2 * (4 + 4) * 3 * 2 = 96
    assert df.shape == (96, 4)
def test_ambiguous_modifiers(
    fetch_formulas_mock, formula_image_metrics_mock, spark_context, ds_config
):
    """Search results stay distinct when different modifier combinations collide.

    The chosen chem mod, neutral loss and adducts can produce identical modifier
    strings and identical ion formulas through different combinations; the test
    checks that each (formula, chem_mod, neutral_loss, adduct) row is still unique
    and that the expected number of rows exists per formula / ion formula.
    """
    with TemporaryDirectory() as tmpdir:
        ds_data_path = Path(tmpdir)
        print(ds_data_path)

        # This set of modifiers is deliberately chosen so that ('', '-H2O', '+H') and
        # ('-H2O+H', '', '') produce the same modifier string, to test that no code
        # accidentally relies on "modifier" or "ion" strings being unambiguous.
        isotope_generation = dict(ds_config["isotope_generation"])
        isotope_generation["chem_mods"] = ["-H2O+H"]
        isotope_generation["neutral_losses"] = ["-H2O"]
        isotope_generation["adducts"] = ["+H", "[M]+"]
        patched_config = dict(ds_config)
        patched_config["isotope_generation"] = isotope_generation

        # Formulae selected to create isomers with the above modifiers.
        formulas = ['H3O', 'H4O', 'H5O2', 'H6O2']
        fetch_formulas_mock.return_value = formulas

        def process_segments_locally(centr_segm_n, func):
            # Apply ``func`` to each segment index and hand the results to Spark.
            return spark_context.parallelize(map(func, range(centr_segm_n)))

        moldbs = [MolecularDB(0, 'test_db', 'version', targeted=True)]
        msm_search = MSMSearch(
            spark_context,
            make_imzml_reader_mock(),
            moldbs,
            patched_config,
            ds_data_path,
            NullProfiler(),
        )
        msm_search._fetch_formula_centroids = make_fetch_formula_centroids_mock()
        msm_search.process_segments = process_segments_locally
        formula_image_metrics_mock.side_effect = make_formula_image_metrics_mock_side_effect()

        moldb_ion_metrics_df, _, fdr_bundle = next(msm_search.search())

        # No two rows may share the same (formula, chem_mod, neutral_loss, adduct).
        modifier_columns = ['formula', 'chem_mod', 'neutral_loss', 'adduct']
        assert not moldb_ion_metrics_df[modifier_columns].duplicated().any()

        expected_rows_per_ion_formula = [
            # There are 4 combinations of modifiers to get H2:
            # (H3O,-H2O+H,,), (H3O,,-H2O,+H), (H4O,,-H2O,), (H5O2,-H2O+H,-H2O,)
            ('H2', 4),
            # Only 1 combination of modifiers can create H7O2: (H6O2,,,+H)
            ('H7O2', 1),
        ]
        for ion_formula, expected_n in expected_rows_per_ion_formula:
            matching = moldb_ion_metrics_df[moldb_ion_metrics_df.ion_formula == ion_formula]
            assert len(matching) == expected_n

        expected_rows_per_formula = [
            # H5O2 and H6O2 can have all combinations:
            # 2 neutral loss options, 2 chem mods, 2 adducts = 8 possible combinations
            ('H5O2', 8),
            ('H6O2', 8),
            # H3O and H4O cannot simultaneously have -H2O and -H2O+H
            ('H3O', 6),
            ('H4O', 6),
        ]
        for formula, expected_n in expected_rows_per_formula:
            matching = moldb_ion_metrics_df[moldb_ion_metrics_df.formula == formula]
            assert len(matching) == expected_n

        assert moldb_ion_metrics_df.formula.isin(formulas).all()
        sanity_check_fdr_diagnostics(fdr_bundle)
def test_delete_ds__completely(sm_config, test_db, es, sm_index):
    """``ESExporter.delete_ds`` removes every document of the given dataset only.

    Three annotation documents (two for 'dataset1', one for 'dataset2') and one
    dataset document for 'dataset1' are indexed. After deleting 'dataset1', its
    annotations and dataset document must be gone while the 'dataset2' annotation
    remains.
    """
    moldb = MolecularDB(0, 'HMDB', '2016')
    moldb2 = MolecularDB(1, 'ChEBI', '2016')
    index = sm_config['elasticsearch']['index']

    def index_doc(doc_type, doc_id, ds_id, db):
        # Store one document that links a dataset id to a molecular database.
        doc_body = {
            'ds_id': ds_id,
            'db_id': db.id,
            'db_name': db.name,
            'db_version': db.version,
        }
        es.create(index=index, doc_type=doc_type, id=doc_id, body=doc_body)

    def count_docs(doc_type, *terms):
        # Count documents of ``doc_type`` that match all of the given term filters.
        term_filters = [{'term': term} for term in terms]
        body = {'query': {'bool': {'filter': term_filters}}}
        return es.count(index=index, doc_type=doc_type, body=body)['count']

    docs_to_index = [
        ('annotation', 'id1', 'dataset1', moldb),
        ('annotation', 'id2', 'dataset1', moldb2),
        ('annotation', 'id3', 'dataset2', moldb),
        ('dataset', 'dataset1', 'dataset1', moldb),
    ]
    for doc_type, doc_id, ds_id, db in docs_to_index:
        index_doc(doc_type, doc_id, ds_id, db)
    wait_for_es(es, index)

    db_mock = MagicMock(spec=DB)
    es_exporter = ESExporter(db_mock, sm_config)
    es_exporter.delete_ds(ds_id='dataset1')
    wait_for_es(es, index)

    # Both annotations of the deleted dataset are gone ...
    assert count_docs('annotation', {'ds_id': 'dataset1'}, {'db_id': moldb.id}) == 0
    assert count_docs('annotation', {'ds_id': 'dataset1'}, {'db_id': moldb2.id}) == 0
    # ... the annotation of the other dataset is untouched ...
    assert count_docs('annotation', {'ds_id': 'dataset2'}, {'db_id': moldb.id}) == 1
    # ... and the dataset document itself has been removed as well.
    assert count_docs('dataset', {'ds_id': 'dataset1'}, {'_type': 'dataset'}) == 0