コード例 #1
0
def test_search(mock_fetch_formulas, spark_context, ds_config):
    """Run ``MSMSearch.search()`` on mocked inputs and check the size of the first result.

    The molecular DB is created with ``targeted=True`` and the formula fetcher is stubbed to
    return two formulas, so one metrics row and one ion image are expected per
    (formula, adduct) pair.
    """
    formulas = ['H2O', 'C5H3O']
    adducts = ds_config['isotope_generation']['adducts']

    def fetch_formulas_stub(moldb_id):
        # Same formula list regardless of which database is requested
        return formulas

    mock_fetch_formulas.side_effect = fetch_formulas_stub

    def run_segments_locally(centr_segm_n, func):
        # Apply func to every segment index on the driver, then wrap the results in an RDD
        return spark_context.parallelize(map(func, range(centr_segm_n)))

    with TemporaryDirectory() as tmpdir:
        ds_data_path = Path(tmpdir)
        moldbs = [MolecularDB(0, 'tests_db', 'version', targeted=True)]

        msm_search = MSMSearch(
            spark_context,
            make_imzml_reader_mock(),
            moldbs,
            ds_config,
            ds_data_path,
            NullProfiler(),
        )
        msm_search._fetch_formula_centroids = make_fetch_formula_centroids_mock()
        msm_search.process_segments = run_segments_locally

        # Only the first yielded result is inspected
        moldb_ion_metrics_df, moldb_ion_images_rdd, fdr_bundle = next(msm_search.search())

        expected_ion_n = len(formulas) * len(adducts)
        assert len(moldb_ion_metrics_df) == expected_ion_n
        assert moldb_ion_images_rdd.count() == expected_ion_n
        sanity_check_fdr_diagnostics(fdr_bundle)
コード例 #2
0
    def test_init_fdr(self, fetch_formulas_mock):
        """``init_fdr`` yields one (moldb, fdr) pair per database, with a non-empty ``td_df``."""
        fdr_config = {'decoy_sample_size': 20}
        ds_config = {
            'analysis_version': 1,
            'fdr': fdr_config,
            'isotope_generation': BASIC_ISOTOPE_GENERATION_CONFIG,
        }
        moldbs = [MolecularDB(0, 'test_db', 'version')]

        moldb_fdr_list = init_fdr(ds_config, moldbs)

        assert len(moldb_fdr_list) == 1
        # Each entry is a 2-item pair; only the FDR object is inspected here
        (_moldb, fdr) = moldb_fdr_list[0]
        assert not fdr.td_df.empty
コード例 #3
0
def test_compute_fdr_and_filter_results(targeted, exp_annot_n, spark_context):
    """Check how many annotations ``compute_fdr_and_filter_results`` keeps.

    ``targeted`` sets the flag on the molecular DB and ``exp_annot_n`` is the number of rows
    and images expected to remain for that setting.
    """
    moldb = MolecularDB(0, 'test_db', 'version', targeted=targeted)
    search_results = make_search_results(spark_context)
    fdr, ion_formula_map_df, formula_metrics_df, formula_images_rdd = search_results

    filtered_results = compute_fdr_and_filter_results(
        moldb, fdr, ion_formula_map_df, formula_metrics_df, formula_images_rdd, None
    )
    moldb_ion_metrics_df, moldb_ion_images_rdd, fdr_bundle = filtered_results

    kept_row_n = moldb_ion_metrics_df.shape[0]
    kept_image_n = moldb_ion_images_rdd.count()
    assert kept_row_n == exp_annot_n
    assert kept_image_n == exp_annot_n
    # The diagnostics bundle must hold at least as many rows as the filtered output
    assert len(fdr_bundle['metrics_df']) >= exp_annot_n
    sanity_check_fdr_diagnostics(fdr_bundle)
コード例 #4
0
def test_compute_fdr(spark_context, ds_config):
    """``compute_fdr`` keeps every input row and returns the expected set of columns."""
    moldbs = [MolecularDB(0, 'test_db', 'version')]
    moldb_fdr_list = init_fdr(ds_config, moldbs)
    _, fdr = moldb_fdr_list[0]

    ion_formulas_df = collect_ion_formulas(spark_context, moldb_fdr_list)
    formula_map_df = ion_formulas_df.drop('moldb_id', axis=1)

    # Three scored ion formulas, indexed by formula_i
    metric_rows = [(10, 'H3O', 0.99), (11, 'C5H4O', 0.5), (12, 'H2ONa', 0.1)]
    raw_metrics_df = pd.DataFrame(metric_rows, columns=['formula_i', 'ion_formula', 'msm'])
    formula_metrics_df = raw_metrics_df.set_index('formula_i')

    metrics_df = compute_fdr(fdr, formula_metrics_df, formula_map_df, None)

    expected_columns = ['ion_formula', 'msm', 'formula', 'modifier', 'fdr']
    assert len(metrics_df) == 3
    # Column order is not part of the contract being tested, so compare sorted names
    assert sorted(metrics_df.columns.tolist()) == sorted(expected_columns)
コード例 #5
0
    def test_decoy_sample_size_30(self, fetch_formulas_mock, spark_context):
        """With ``decoy_sample_size`` 30, ``collect_ion_formulas`` returns a 62x4 frame."""
        fdr_config = {'decoy_sample_size': 30}
        ds_config = {
            'analysis_version': 1,
            'fdr': fdr_config,
            'isotope_generation': BASIC_ISOTOPE_GENERATION_CONFIG,
        }
        moldbs = [MolecularDB(0, 'test_db', 'version')]
        moldb_fdr_list = init_fdr(ds_config, moldbs)

        df = collect_ion_formulas(spark_context, moldb_fdr_list)

        expected_columns = ['moldb_id', 'ion_formula', 'formula', 'modifier']
        assert df.columns.tolist() == expected_columns
        # NOTE(review): 62 presumably = 2 formulas * (1 target + 30 decoy adducts) -- confirm
        # against BASIC_ISOTOPE_GENERATION_CONFIG and the mocked formula list
        expected_shape = (62, 4)
        assert df.shape == expected_shape
コード例 #6
0
    def test_neutral_losses_and_chem_mods(self, fetch_formulas_mock,
                                          spark_context):
        """Neutral losses and chem mods multiply the number of collected ion formulas."""
        fdr_config = {'decoy_sample_size': 1}
        ds_config = {
            'analysis_version': 1,
            'fdr': fdr_config,
            'isotope_generation': FULL_ISOTOPE_GENERATION_CONFIG,
        }
        moldbs = [MolecularDB(0, 'test_db', 'version')]
        moldb_fdr_list = init_fdr(ds_config, moldbs)

        df = collect_ion_formulas(spark_context, moldb_fdr_list)

        expected_columns = ['moldb_id', 'ion_formula', 'formula', 'modifier']
        assert df.columns.tolist() == expected_columns
        # Expected row count:
        #   2 formulas
        #   * (4 target adducts + 4 target adducts * 1 decoy adduct per target adduct)
        #   * (no loss + 2 neutral losses)
        #   * (no mod + 1 chem mod)
        #   = 2 * (4 + 4) * 3 * 2 = 96
        expected_shape = (96, 4)
        assert df.shape == expected_shape
コード例 #7
0
def test_ambiguous_modifiers(
    fetch_formulas_mock, formula_image_metrics_mock, spark_context, ds_config
):
    """Check that search results stay distinct when modifier combinations collide.

    The chem mod, neutral loss and adducts are chosen so that different
    (chem_mod, neutral_loss, adduct) combinations produce the same modifier string and the
    same ion formula. The test asserts that every combination is still reported as its own
    row, keyed by the original formula.
    """
    with TemporaryDirectory() as tmpdir:
        ds_data_path = Path(tmpdir)

        ds_config = {
            **ds_config,
            "isotope_generation": {
                **ds_config["isotope_generation"],
                # These modifiers are deliberately chosen so that ('', '-H2O', '+H') and
                # ('-H2O+H', '', '') produce the same modifier string, to test that no code
                # accidentally relies on "modifier" or "ion" strings being unambiguous
                "chem_mods": ["-H2O+H"],
                "neutral_losses": ["-H2O"],
                "adducts": ["+H", "[M]+"],
            },
        }

        # Formulas selected to create isomers with the above modifiers
        formulas = [
            'H3O',
            'H4O',
            'H5O2',
            'H6O2',
        ]
        fetch_formulas_mock.return_value = formulas
        msm_search = MSMSearch(
            spark_context,
            make_imzml_reader_mock(),
            [MolecularDB(0, 'test_db', 'version', targeted=True)],
            ds_config,
            ds_data_path,
            NullProfiler(),
        )
        msm_search._fetch_formula_centroids = make_fetch_formula_centroids_mock()
        # Apply func to every segment index on the driver, then wrap the results in an RDD
        msm_search.process_segments = lambda centr_segm_n, func: spark_context.parallelize(
            map(func, range(centr_segm_n))
        )
        formula_image_metrics_mock.side_effect = make_formula_image_metrics_mock_side_effect()

        # Only the first yielded result is inspected
        moldb_ion_metrics_df, _, fdr_bundle = next(msm_search.search())

        # Every (formula, chem_mod, neutral_loss, adduct) combination appears at most once
        assert (
            moldb_ion_metrics_df[['formula', 'chem_mod', 'neutral_loss', 'adduct']]
            .duplicated()
            .sum()
            == 0
        )
        # There are 4 combinations of modifiers to get H2:
        # (H3O,-H2O+H,,), (H3O,,-H2O,+H), (H4O,,-H2O,), (H5O2,-H2O+H,-H2O,)
        assert len(moldb_ion_metrics_df[moldb_ion_metrics_df.ion_formula == 'H2']) == 4
        # Only 1 combination of modifiers can create H7O2: (H6O2,,,+H)
        assert len(moldb_ion_metrics_df[moldb_ion_metrics_df.ion_formula == 'H7O2']) == 1

        # H5O2 and H6O2 can have all combinations:
        # 2 neutral loss options * 2 chem mod options * 2 adducts = 8 possible combinations
        assert len(moldb_ion_metrics_df[moldb_ion_metrics_df.formula == 'H5O2']) == 8
        assert len(moldb_ion_metrics_df[moldb_ion_metrics_df.formula == 'H6O2']) == 8
        # H3O and H4O cannot simultaneously have -H2O and -H2O+H
        assert len(moldb_ion_metrics_df[moldb_ion_metrics_df.formula == 'H3O']) == 6
        assert len(moldb_ion_metrics_df[moldb_ion_metrics_df.formula == 'H4O']) == 6

        # Modifiers must never leak into the reported base formula
        assert moldb_ion_metrics_df.formula.isin(formulas).all()

        sanity_check_fdr_diagnostics(fdr_bundle)
コード例 #8
0
def test_delete_ds__completely(sm_config, test_db, es, sm_index):
    """``ESExporter.delete_ds`` removes one dataset's documents and leaves others alone.

    Seeds annotations for two datasets plus a dataset document, deletes ``dataset1``, then
    checks that its annotations (for both databases) and its dataset document are gone while
    the ``dataset2`` annotation is still present.
    """
    moldb = MolecularDB(0, 'HMDB', '2016')
    moldb2 = MolecularDB(1, 'ChEBI', '2016')
    index = sm_config['elasticsearch']['index']

    def create_doc(doc_type, doc_id, ds_id, db):
        # Index one document that links a dataset id to a molecular database
        es.create(
            index=index,
            doc_type=doc_type,
            id=doc_id,
            body={
                'ds_id': ds_id,
                'db_id': db.id,
                'db_name': db.name,
                'db_version': db.version,
            },
        )

    def count_docs(doc_type, *terms):
        # Count documents of doc_type that match every given term filter
        body = {'query': {'bool': {'filter': [{'term': term} for term in terms]}}}
        return es.count(index=index, doc_type=doc_type, body=body)['count']

    seeded_docs = [
        ('annotation', 'id1', 'dataset1', moldb),
        ('annotation', 'id2', 'dataset1', moldb2),
        ('annotation', 'id3', 'dataset2', moldb),
        ('dataset', 'dataset1', 'dataset1', moldb),
    ]
    for doc_type, doc_id, ds_id, db in seeded_docs:
        create_doc(doc_type, doc_id, ds_id, db)

    wait_for_es(es, index)

    db_mock = MagicMock(spec=DB)
    es_exporter = ESExporter(db_mock, sm_config)
    es_exporter.delete_ds(ds_id='dataset1')

    wait_for_es(es, index)

    # dataset1 annotations are gone for both databases
    assert count_docs('annotation', {'ds_id': 'dataset1'}, {'db_id': moldb.id}) == 0
    assert count_docs('annotation', {'ds_id': 'dataset1'}, {'db_id': moldb2.id}) == 0
    # The annotation belonging to dataset2 is untouched
    assert count_docs('annotation', {'ds_id': 'dataset2'}, {'db_id': moldb.id}) == 1
    # The dataset1 dataset document is gone as well
    assert count_docs('dataset', {'ds_id': 'dataset1'}, {'_type': 'dataset'}) == 0