Code example #1
    def test_add_optical_image(self, fill_db, sm_config, ds_config):
        db = DB(sm_config['db'])
        action_queue_mock = MagicMock(spec=QueuePublisher)
        es_mock = MagicMock(spec=ESExporter)
        img_store_mock = MagicMock(ImageStoreServiceWrapper)
        img_store_mock.post_image.side_effect = [
            'opt_img_id1', 'opt_img_id2', 'opt_img_id3', 'thumbnail_id'
        ]
        img_store_mock.get_image_by_id.return_value = Image.new(
            'RGB', (100, 100))

        ds_man = create_api_ds_man(sm_config=sm_config,
                                   db=db,
                                   es=es_mock,
                                   img_store=img_store_mock,
                                   annot_queue=action_queue_mock)
        ds_man._annotation_image_shape = MagicMock(return_value=(100, 100))

        ds_id = '2000-01-01'
        ds = create_ds(ds_id=ds_id, ds_config=ds_config)

        zoom_levels = [1, 2, 3]
        raw_img_id = 'raw_opt_img_id'
        ds_man.add_optical_image(ds,
                                 raw_img_id, [[1, 0, 0], [0, 1, 0], [0, 0, 1]],
                                 zoom_levels=zoom_levels)
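        # DB.select returns a list of row tuples, so the result can be compared to the expected rows directly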
        assert db.select('SELECT * FROM optical_image') == [
            ('opt_img_id{}'.format(i + 1), ds.id, zoom)
            for i, zoom in enumerate(zoom_levels)
        ]
        assert db.select('SELECT optical_image FROM dataset where id = %s',
                         params=(ds_id, )) == [(raw_img_id, )]
        assert db.select('SELECT thumbnail FROM dataset where id = %s',
                         params=(ds_id, )) == [('thumbnail_id', )]
Code example #2
def run_off_sample(sm_config, ds_ids_str, sql_where, fix_missing, overwrite_existing):
    db = DB()

    ds_ids = None
    if ds_ids_str:
        ds_ids = ds_ids_str.split(',')
    elif sql_where:
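        # db.select yields 1-tuples here, so each id is unpacked while building the list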
        ds_ids = [
            id for (id,) in db.select(f'SELECT DISTINCT dataset.id FROM dataset WHERE {sql_where}')
        ]
    elif fix_missing:
        logger.info('Checking for missing off-sample jobs...')
        results = db.select(MISSING_OFF_SAMPLE_SEL)
        ds_ids = [ds_id for ds_id, in results]
        logger.info(f'Found {len(ds_ids)} missing off-sample sets')

    if not ds_ids:
        logger.warning('No datasets match filter')
        return

    es_exp = ESExporter(db, sm_config)
    for i, ds_id in enumerate(ds_ids):
        try:
            logger.info(f'Running off-sample on {i+1} out of {len(ds_ids)}')
            ds = Dataset.load(db, ds_id)
            classify_dataset_ion_images(db, ds, sm_config['services'], overwrite_existing)
            es_exp.reindex_ds(ds_id)
        except Exception:
            logger.error(f'Failed to run off-sample on {ds_id}', exc_info=True)
Code example #3
def test_search_job_imzml_example(get_compute_img_measures_mock, filter_sf_metrics_mock, create_fill_sm_database, sm_config):
    get_compute_img_measures_mock.return_value = lambda *args: (0.9, 0.9, 0.9)
    filter_sf_metrics_mock.side_effect = lambda x: x

    SMConfig._config_dict = sm_config

    db = DB(sm_config['db'])
    try:
        job = SearchJob(None, 'imzml_example_ds')
        job.run(input_dir_path, ds_config_path, clean=True)

        # dataset meta asserts
        rows = db.select("SELECT name, file_path, img_bounds from dataset")
        img_bounds = {u'y': {u'max': 3, u'min': 1}, u'x': {u'max': 3, u'min': 1}}
        file_path = join(dirname(__file__), 'data', 'imzml_example_ds')
        assert len(rows) == 1
        assert rows[0] == (test_ds_name, file_path, img_bounds)

        # theoretical patterns asserts
        rows = db.select('SELECT db_id, sf_id, adduct, centr_mzs, centr_ints, prof_mzs, prof_ints '
                         'FROM theor_peaks '
                         'ORDER BY adduct')

        assert len(rows) == 3 + len(DECOY_ADDUCTS)
        for r in rows:
            assert r[3] and r[4]

        # image metrics asserts
        rows = db.select(('SELECT db_id, sf_id, adduct, peaks_n, stats FROM iso_image_metrics '
                          'ORDER BY sf_id, adduct'))

        assert rows
        assert rows[0]
        assert tuple(rows[0][:2]) == (0, 10007)
        assert set(rows[0][4].keys()) == {'chaos', 'spatial', 'spectral'}

        # image asserts
        rows = db.select(('SELECT db_id, sf_id, adduct, peak, intensities, min_int, max_int '
                          'FROM iso_image '
                          'ORDER BY sf_id, adduct'))
        assert rows

        max_int = 0.0
        for r in rows:
            max_int = max(max_int, r[-1])
            assert tuple(r[:2]) == (0, 10007)
        assert max_int

    finally:
        db.close()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
Code example #4
def test_search_job_imzml_example(get_compute_img_measures_mock, create_fill_sm_database, sm_config):
    get_compute_img_measures_mock.return_value = lambda *args: (0.9, 0.9, 0.9)

    SMConfig._config_dict = sm_config

    db = DB(sm_config['db'])
    try:
        job = SearchJob(None, 'imzml_example_ds')
        job.run(input_dir_path, ds_config_path, clean=True)

        # dataset meta asserts
        rows = db.select("SELECT name, file_path, img_bounds from dataset")
        img_bounds = {u'y': {u'max': 3, u'min': 1}, u'x': {u'max': 3, u'min': 1}}
        file_path = 'file://' + join(data_dir_path, 'ds.txt')
        assert len(rows) == 1
        assert rows[0] == (test_ds_name, file_path, img_bounds)

        # theoretical patterns asserts
        rows = db.select('SELECT db_id, sf_id, adduct, centr_mzs, centr_ints, prof_mzs, prof_ints '
                         'FROM theor_peaks '
                         'ORDER BY adduct')

        assert len(rows) == 3 + len(DECOY_ADDUCTS)
        for r in rows:
            assert r[3] and r[4]

        # image metrics asserts
        rows = db.select(('SELECT db_id, sf_id, adduct, peaks_n, stats FROM iso_image_metrics '
                          'ORDER BY sf_id, adduct'))

        assert rows
        assert rows[0]
        assert tuple(rows[0][:2]) == (0, 10007)
        assert set(rows[0][4].keys()) == {'chaos', 'spatial', 'spectral'}

        # image asserts
        rows = db.select(('SELECT db_id, sf_id, adduct, peak, intensities, min_int, max_int '
                          'FROM iso_image '
                          'ORDER BY sf_id, adduct'))
        assert rows

        max_int = 0.0
        for r in rows:
            max_int = max(max_int, r[-1])
            assert tuple(r[:2]) == (0, 10007)
        assert max_int

    finally:
        db.close()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
Code example #5
def run(sm_config, ds_id_str, sql_where, algorithm, use_lithops):
    db = DB()

    if sql_where:
        ds_ids = [
            id for (id, ) in db.select(
                f'SELECT DISTINCT dataset.id FROM dataset WHERE {sql_where}')
        ]
    else:
        ds_ids = ds_id_str.split(',')

    if not ds_ids:
        logger.warning('No datasets match filter')
        return

    if use_lithops:
        executor = Executor(sm_config['lithops'])

    for i, ds_id in enumerate(ds_ids):
        try:
            logger.info(
                f'[{i+1} / {len(ds_ids)}] Generating ion thumbnail for {ds_id}'
            )
            ds = Dataset.load(db, ds_id)
            if use_lithops:
                # noinspection PyUnboundLocalVariable
                generate_ion_thumbnail_lithops(executor,
                                               db,
                                               ds,
                                               algorithm=algorithm)
            else:
                generate_ion_thumbnail(db, ds, algorithm=algorithm)
        except Exception:
            logger.error(f'Failed on {ds_id}', exc_info=True)
Code example #6
def reindex_results(sm_config, ds_id, ds_mask, use_inactive_index,
                    offline_reindex, update_fields):
    assert ds_id or ds_mask or offline_reindex

    IsocalcWrapper.set_centroids_cache_enabled(True)

    if offline_reindex:
        _reindex_all(sm_config)
    else:
        es_config = sm_config['elasticsearch']
        if use_inactive_index:
            es_config = get_inactive_index_es_config(es_config)

        db = DB()
        es_exp = ESExporter(db,
                            sm_config={
                                **sm_config, 'elasticsearch': es_config
                            })

        if ds_id:
            ds_ids = ds_id.split(',')
        elif ds_mask:
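            # the LIKE pattern is interpolated with str.format rather than passed as a query parameter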
            ds_ids = [
                id for (id, ) in db.select(
                    "select id from dataset where name like '{}%'".format(
                        ds_mask))
            ]
        else:
            ds_ids = []

        if update_fields:
            _partial_update_datasets(ds_ids, es_exp, update_fields.split(','))
        else:
            _reindex_datasets(ds_ids, es_exp)
Code example #7
class SciTester(object):
    def __init__(self, db_config):
        self.db = DB(db_config)
        self.base_search_res_path = join(proj_root(), 'tests/reports',
                                         'spheroid_12h_search_res.csv')
        self.metrics = ['chaos', 'spatial', 'spectral']

    def metr_dict_to_array(self, metr_d):
        return np.array([metr_d[m] for m in self.metrics])

    def read_base_search_res(self):
        with open(self.base_search_res_path) as f:
            rows = map(lambda line: line.strip('\n').split('\t'),
                       f.readlines()[1:])
            return {(r[0], r[1]): np.array(r[2:], dtype=float) for r in rows}

    def fetch_search_res(self):
        rows = self.db.select(SEARCH_RES_SELECT, ds_name, 'HMDB')
        return {(r[0], r[1]): self.metr_dict_to_array(r[2]) for r in rows}

    def run_sci_test(self):
        compare_search_results(self.read_base_search_res(),
                               self.fetch_search_res())

    def save_sci_test_report(self):
        with open(self.base_search_res_path, 'w') as f:
            f.write('\t'.join(['sf', 'adduct'] + self.metrics) + '\n')
            for (sf, adduct), metrics in sorted(
                    self.fetch_search_res().iteritems()):
                f.write('\t'.join([sf, adduct] +
                                  metrics.astype(str).tolist()) + '\n')

        print 'Successfully saved sample dataset search report'
Code example #8
class SciTester(object):

    def __init__(self, db_config):
        self.db = DB(db_config)
        self.base_search_res_path = join(proj_root(), 'tests/reports', 'spheroid_12h_search_res.csv')
        self.metrics = ['chaos', 'spatial', 'spectral']

    def metr_dict_to_array(self, metr_d):
        return np.array([metr_d[m] for m in self.metrics])

    def read_base_search_res(self):
        with open(self.base_search_res_path) as f:
            rows = map(lambda line: line.strip('\n').split('\t'), f.readlines()[1:])
            return {(r[0], r[1]): np.array(r[2:], dtype=float) for r in rows}

    def fetch_search_res(self):
        rows = self.db.select(SEARCH_RES_SELECT, ds_name, 'HMDB')
        return {(r[0], r[1]): self.metr_dict_to_array(r[2]) for r in rows}

    def run_sci_test(self):
        compare_search_results(self.read_base_search_res(), self.fetch_search_res())

    def save_sci_test_report(self):
        with open(self.base_search_res_path, 'w') as f:
            f.write('\t'.join(['sf', 'adduct'] + self.metrics) + '\n')
            for (sf, adduct), metrics in sorted(self.fetch_search_res().iteritems()):
                f.write('\t'.join([sf, adduct] + metrics.astype(str).tolist()) + '\n')

        print 'Successfully saved sample dataset search report'
Code example #9
def test_save_sf_iso_images_correct_db_call(spark_context,
                                            create_fill_sm_database, sm_config,
                                            ds_config):
    sf_iso_imgs = spark_context.parallelize([((1, '+H'), [
        csr_matrix([[100, 0, 0], [0, 0, 0]]),
        csr_matrix([[0, 0, 0], [0, 0, 10]])
    ])])
    sf_adduct_peaksn = [(1, '+H', 2)]
    res = SearchResults(0, 0, 0, 'ds_name', sf_adduct_peaksn, db_mock,
                        sm_config, ds_config)
    res.sf_iso_images = sf_iso_imgs
    res.nrows, res.ncols = 2, 3
    res.store_sf_iso_images()

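    # expected rows: (job_id, db_id, sf_id, adduct, peak, pixel_inds, intensities, min_int, max_int)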
    correct_rows = [(0, 0, 1, '+H', 0, [0], [100], 0, 100),
                    (0, 0, 1, '+H', 1, [5], [10], 0, 10)]

    db = DB(sm_config['db'])
    try:
        rows = db.select((
            'SELECT job_id, db_id, sf_id, adduct, peak, pixel_inds, intensities, min_int, max_int '
            'FROM iso_image '
            'ORDER BY sf_id, adduct'))
        assert correct_rows == rows
    finally:
        db.close()
Code example #10
def migrate_optical_images(ds_id):
    output.print('Migrating optical images')

    with timeit():
        output.print('Transferring images and updating database...')
        db = DB()
        rows = db.select(SEL_OPTICAL_IMGS, params=(ds_id,))
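        # each row is (optical image id, optical image URL); only images without a URL are transferred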
        for opt_image_id, opt_image_url in rows:
            if not opt_image_url and opt_image_id:
                transfer_images(
                    ds_id,
                    'optical_images',
                    image_storage.OPTICAL,
                    [opt_image_id],
                )
                opt_image_url = image_storage.get_image_url(
                    image_storage.OPTICAL, ds_id, opt_image_id
                )
                db.alter(UPD_OPTICAL_IMGS, params=(opt_image_url, opt_image_id))

        opt_thumb_id, opt_thumb_url = db.select_one(SEL_OPT_THUMB, params=(ds_id,))
        if not opt_thumb_url and opt_thumb_id:
            transfer_images(
                ds_id,
                'optical_images',
                image_storage.OPTICAL,
                [opt_thumb_id],
            )
            opt_thumb_url = image_storage.get_image_url(image_storage.OPTICAL, ds_id, opt_thumb_id)
            db.alter(UPD_OPT_THUMB, params=(opt_thumb_url, ds_id))
Code example #11
def update_optical_images(ds_id_str, sql_where):
    db = DB()

    if ds_id_str:
        ds_ids = ds_id_str.split(',')
    else:
        ds_ids = [
            id for (id, ) in db.select(
                f'SELECT DISTINCT dataset.id FROM dataset WHERE {sql_where}')
        ]

    for i, ds_id in enumerate(ds_ids):
        try:
            transform, img_id = db.select_one(
                'SELECT transform, optical_image from dataset WHERE id = %s',
                params=(ds_id, ))
            if img_id and transform:
                logger.info(
                    f'[{i + 1}/{len(ds_ids)}] Updating optical image of dataset {ds_id}'
                )
                add_optical_image(db, ds_id, img_id, transform)
            else:
                logger.info(
                    f'[{i + 1}/{len(ds_ids)}] Skipping dataset {ds_id}')
        except Exception:
            logger.error(f'Failed to update optical image on {ds_id}',
                         exc_info=True)
Code example #12
def test_new_ds_saves_to_db(test_db, metadata, ds_config):
    db = DB()
    moldb = create_test_molecular_db()
    ds_config['database_ids'] = [moldb.id]
    ds = create_test_ds(config={**ds_config, 'database_ids': [moldb.id]})

    ion_metrics_df = pd.DataFrame({
        'formula': ['H2O', 'H2O', 'CO2', 'CO2', 'H2SO4', 'H2SO4'],
        'adduct': ['+H', '[M]+', '+H', '[M]+', '+H', '[M]+'],
        'fdr': [0.05, 0.1, 0.05, 0.1, 0.05, 0.1],
        'image_id': list(map(str, range(6))),
    })
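    # insert_return returns the RETURNING values; here it yields the id of the new job row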
    (job_id, ) = db.insert_return(
        "INSERT INTO job (moldb_id, ds_id, status) VALUES (%s, %s, 'FINISHED') RETURNING id",
        rows=[(moldb.id, ds.id)],
    )
    db.insert(
        'INSERT INTO annotation('
        '   job_id, formula, chem_mod, neutral_loss, adduct, msm, fdr, stats, iso_image_ids'
        ') '
        "VALUES (%s, %s, '', '', %s, 1, %s, '{}', %s)",
        [(job_id, r.formula, r.adduct, r.fdr, [r.image_id])
         for i, r in ion_metrics_df.iterrows()],
    )

    with patch(
            'sm.engine.postprocessing.colocalization.ImageStorage.get_ion_images_for_analysis'
    ) as get_ion_images_for_analysis_mock:
        get_ion_images_for_analysis_mock.side_effect = mock_get_ion_images_for_analysis

        Colocalization(db).run_coloc_job(ds)

    jobs = db.select('SELECT id, error, sample_ion_ids FROM graphql.coloc_job')
    annotations = db.select(
        'SELECT coloc_ion_ids, coloc_coeffs FROM graphql.coloc_annotation')
    ions = db.select('SELECT id FROM graphql.ion')

    assert len(jobs) > 0
    assert not any(job[1] for job in jobs)
    assert jobs[0][2]
    assert len(annotations) > 10
    assert all(len(ann[0]) == len(ann[1]) for ann in annotations)
    assert len(ions) == len(ion_metrics_df)
Code example #13
def reindex_all_results(conf):
    db = DB(conf['db'])
    es_exp = ESExporter(conf)

    es_exp.delete_index(name='sm')
    es_exp.create_index(name='sm')

    ds_db_pairs = db.select("select name, config -> 'database'::text -> 'name'::text from dataset")

    for ds_name, db_name in ds_db_pairs:
        es_exp.index_ds(db, ds_name, db_name)
Code example #14
def reindex_all_results(conf):
    db = DB(conf['db'])
    es_exp = ESExporter(conf)

    es_exp.delete_index(name='sm')
    es_exp.create_index(name='sm')

    ds_db_pairs = db.select(
        "select name, config -> 'database'::text -> 'name'::text from dataset")

    for ds_name, db_name in ds_db_pairs:
        es_exp.index_ds(db, ds_name, db_name)
Code example #15
def test_theor_peaks_generator_run_failed_iso_peaks(create_fill_test_db, spark_context, sm_config, ds_config):
    ds_config["isotope_generation"]["adducts"] = ["+Na"]
    theor_peaks_gen = TheorPeaksGenerator(spark_context, sm_config, ds_config)
    theor_peaks_gen.isocalc_wrapper.isotope_peaks = lambda *args: Centroids([], [])
    theor_peaks_gen.run()

    db = DB(sm_config['db'])
    rows = db.select('SELECT * FROM theor_peaks')

    assert len(rows) == 1

    db.close()
Code example #16
def reindex_results(ds_id, ds_mask):
    assert ds_id or ds_mask

    conf = SMConfig.get_conf()
    if ds_mask == '_all_':
        _reindex_all(conf)
    else:
        db = DB(conf['db'])
        es_exp = ESExporter(db)

        if ds_id:
            rows = db.select(
                "select id, name, config from dataset where id = '{}'".format(
                    ds_id))
        elif ds_mask:
            rows = db.select(
                "select id, name, config from dataset where name like '{}%'".
                format(ds_mask))
        else:
            rows = []

        _reindex_datasets(rows, es_exp)
Code example #17
def test_add_optical_image(image_storage_mock, requests_mock, fill_db, metadata, ds_config):
    image_ids = [
        'opt_img_scaled_id1',
        'opt_img_id1',
        'opt_img_scaled_id2',
        'opt_img_id2',
        'opt_img_scaled_id3',
        'opt_img_id3',
        'thumbnail_id',
    ]
    image_storage_mock.post_image.side_effect = image_ids
    image_storage_mock.get_image_url.return_value = [f'http://{img_id}' for img_id in image_ids]
    image_storage_mock.get_image.return_value = create_image_bytes()

    requests_mock.get.return_value = mock.Mock(content=create_image_bytes())

    db = DB()
    ds = create_test_ds()

    zoom_levels = [1, 2, 3]
    raw_img_id = 'raw_opt_img_id'
    add_optical_image(
        db, ds.id, raw_img_id, [[1, 0, 0], [0, 1, 0], [0, 0, 1]], zoom_levels=zoom_levels
    )

    optical_images = db.select(f"SELECT ds_id, type, zoom FROM optical_image")
    for type, zoom in itertools.product(
        [OpticalImageType.SCALED, OpticalImageType.CLIPPED_TO_ION_IMAGE], zoom_levels
    ):
        assert (ds.id, type, zoom) in optical_images

    assert db.select('SELECT optical_image FROM dataset where id = %s', params=(ds.id,)) == [
        (raw_img_id,)
    ]
    assert db.select('SELECT thumbnail FROM dataset where id = %s', params=(ds.id,)) == [
        ('thumbnail_id',)
    ]
Code example #18
def test_theor_peaks_generator_run_1(create_fill_test_db, spark_context, sm_config, ds_config):
    ds_config["isotope_generation"]["adducts"] = ["+H", "+Na"]
    theor_peaks_gen = TheorPeaksGenerator(spark_context, sm_config, ds_config)
    theor_peaks_gen.isocalc_wrapper.isotope_peaks = lambda *args: Centroids([100., 200.], [10., 1.])
    theor_peaks_gen.run()

    db = DB(sm_config['db'])
    rows = db.select(('SELECT db_id, sf_id, adduct, sigma, charge, pts_per_mz, centr_mzs, '
                      'centr_ints, prof_mzs, prof_ints FROM theor_peaks ORDER BY sf_id, adduct'))

    assert len(rows) == 2 + 80
    assert (filter(lambda r: r[2] == '+H', rows)[0] ==
            (0, 9, '+H', 0.01, 1, 10000, [100., 200.], [10., 1.], [], []))
    assert (filter(lambda r: r[2] == '+Na', rows)[0] ==
            (0, 9, '+Na', 0.01, 1, 10000, [100., 200.], [10., 1.], [], []))

    db.close()
Code example #19
def find_dataset_ids(ds_ids_param, sql_where, missing, failed, succeeded):
    db = DB()

    if ds_ids_param:
        specified_ds_ids = ds_ids_param.split(',')
    elif sql_where:
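        # select_onecol returns the matching ids as a flat list (one value per row)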
        specified_ds_ids = db.select_onecol(
            f"SELECT id FROM dataset WHERE {sql_where}")
    else:
        specified_ds_ids = None
    if not missing:
        # Default to processing all datasets missing diagnostics
        missing = specified_ds_ids is None and not failed and not succeeded
    ds_type_counts = db.select(
        'SELECT d.id, COUNT(DISTINCT dd.type), COUNT(dd.error) '
        'FROM dataset d LEFT JOIN dataset_diagnostic dd on d.id = dd.ds_id '
        'WHERE d.status = \'FINISHED\' '
        'GROUP BY d.id')
    if missing or failed or succeeded:
        # Get ds_ids based on status (or filter specified ds_ids on status)
        status_ds_ids = set()
        for ds_id, n_diagnostics, n_errors in ds_type_counts:
            if missing and (n_diagnostics or 0) < len(DiagnosticType):
                status_ds_ids.add(ds_id)
            elif failed and n_errors > 0:
                status_ds_ids.add(ds_id)
            elif succeeded and n_diagnostics == len(
                    DiagnosticType) and n_errors == 0:
                status_ds_ids.add(ds_id)

        if specified_ds_ids is not None:
            # Keep order, if directly specified
            ds_ids = [
                ds_id for ds_id in specified_ds_ids if ds_id in status_ds_ids
            ]
        else:
            # Order by ID descending, so that newer DSs are updated first
            ds_ids = sorted(status_ds_ids, reverse=True)
    else:
        ds_ids = specified_ds_ids
    assert ds_ids, 'No datasets found'
    return ds_ids
Code example #20
def _reindex_all(conf):
    es_config = conf['elasticsearch']
    alias = es_config['index']
    es_man = ESIndexManager(es_config)
    new_index = es_man.another_index_name(es_man.internal_index_name(alias))
    es_man.create_index(new_index)

    try:
        tmp_es_config = deepcopy(es_config)
        tmp_es_config['index'] = new_index

        db = DB(conf['db'])
        es_exp = ESExporter(db, tmp_es_config)
        rows = db.select('select id, name, config from dataset')
        _reindex_datasets(rows, es_exp)

        es_man.remap_alias(tmp_es_config['index'], alias=alias)
    except Exception as e:
        es_man.delete_index(new_index)
        raise e
Code example #21
def run(ds_id, sql_where):

    conf = SMConfig.get_conf()

    db = DB(conf['db'])
    img_store = ImageStoreServiceWrapper(conf['services']['img_service_url'])

    if sql_where:
        ds_ids = [
            id for (id,) in db.select(f'SELECT DISTINCT dataset.id FROM dataset WHERE {sql_where}')
        ]
    else:
        ds_ids = ds_id.split(',')

    if not ds_ids:
        logger.warning('No datasets match filter')
        return

    for i, ds_id in enumerate(ds_ids):
        try:
            logger.info(f'[{i+1} / {len(ds_ids)}] Updating acq geometry for {ds_id}')
            ds = Dataset.load(db, ds_id)
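            # select_one returns a single row; the query picks the first ion image id of one annotation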
            (sample_img_id,) = db.select_one(
                "SELECT iim.iso_image_ids[1] from job j "
                "JOIN iso_image_metrics iim on j.id = iim.job_id "
                "WHERE j.ds_id = %s LIMIT 1",
                [ds_id],
            )
            print(sample_img_id)
            if sample_img_id:
                w, h = img_store.get_image_by_id('fs', 'iso_image', sample_img_id).size
                dims = (h, w)  # (n_rows, n_cols)
            else:
                dims = (None, None)

            acq_geometry = make_acq_geometry('ims', None, ds.metadata, dims)

            ds.save_acq_geometry(db, acq_geometry)

        except Exception:
            logger.error(f'Failed on {ds_id}', exc_info=True)
Code example #22
def update_public_database_descriptions():
    db = DB()
    public_db_names = db.select(
        'SELECT name FROM molecular_db WHERE is_public = true AND archived = false'
    )
    logger.info(f'Updating public molecular databases: {public_db_names}')

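    # selecting a single column still yields 1-tuples, hence the (name, ) unpacking below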
    for (name, ) in public_db_names:
        desc = database_descriptions.get(name, None)
        if desc:
            db.alter(
                "UPDATE molecular_db "
                "SET description = %s, full_name = %s, link = %s, citation = %s "
                "WHERE name = %s;",
                params=(
                    desc['description'],
                    desc['full_name'],
                    desc['link'],
                    desc['citation'],
                    name,
                ),
            )
Code example #23
def _reindex_all(sm_config):
    es_config = sm_config['elasticsearch']
    alias = es_config['index']
    es_man = ESIndexManager(es_config)
    old_index = es_man.internal_index_name(alias)
    new_index = es_man.another_index_name(old_index)
    es_man.create_index(new_index)

    try:
        inactive_es_config = get_inactive_index_es_config(es_config)
        db = DB()
        es_exp = ESExporter(db, {
            **sm_config, 'elasticsearch': inactive_es_config
        })
        ds_ids = [r[0] for r in db.select('select id from dataset')]
        _reindex_datasets(ds_ids, es_exp)

        es_man.remap_alias(inactive_es_config['index'], alias=alias)
    except Exception as e:
        es_man.delete_index(new_index)
        raise e
    else:
        es_man.delete_index(old_index)
Code example #24
File: spheroid.py Project: metaspace2020/metaspace
def ensure_db_populated(sm_config, analysis_version, database):
    db = DB()
    # Install DB schema if needed
    query = "SELECT COUNT(*) FROM pg_tables WHERE schemaname = 'public' AND tablename = 'dataset'"
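    # select_one returns a single row tuple; element 0 holds the COUNT(*) value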
    tables_exist = db.select_one(query)[0] >= 1
    if not tables_exist:
        print('Installing DB schema')
        db.alter(DB_SQL_SCHEMA)

    # Import HMDB if needed
    moldb = MOL_DBS[database]
    try:
        molecular_db.find_by_name_version(moldb['name'], moldb['version'])
    except SMError:
        print(f'Importing {database}')
        with TemporaryDirectory() as tmp:
            urlretrieve(moldb['url'], f'{tmp}/moldb.tsv')
            molecular_db.create(moldb['name'], moldb['version'],
                                f'{tmp}/moldb.tsv')

    if analysis_version > 1:
        if len(
                db.select(
                    "SELECT name FROM scoring_model WHERE name = 'v3_default'")
        ) == 0:
            print("Importing v3_default scoring model")
            params = upload_catboost_scoring_model(
                model=Path(proj_root()) /
                '../scoring-models/v3_default/model-2022-01-05T13-45-26.947188-416b1311.cbm',
                bucket=sm_config['lithops']['lithops']['storage_bucket'],
                prefix=f'test_scoring_models/v3_default',
                is_public=False,
            )
            save_scoring_model_to_db(name='v3_default',
                                     type_='catboost',
                                     params=params)
Code example #25
def test_search_job_imzml_example(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                                  post_images_to_annot_service_mock, MolDBServiceWrapperMock, MolDBServiceWrapperMock2,
                                  sm_config, create_fill_sm_database, es_dsl_search, clean_isotope_storage):
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock2)

    get_compute_img_metrics_mock.return_value = lambda *args: (0.9, 0.9, 0.9, [100.], [0], [10.])
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {
        'iso_image_ids': ['iso_image_1', None, None, None]
    }
    post_images_to_annot_service_mock.return_value = {
        35: url_dict,
        44: url_dict
    }

    db = DB(sm_config['db'])

    try:
        ds_config_str = open(ds_config_path).read()
        upload_dt = datetime.now()
        ds_id = '2000-01-01_00h00m'
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])

        img_store = ImageStoreServiceWrapper(sm_config['services']['img_service_url'])
        job = SearchJob(img_store=img_store)
        job._sm_config['rabbitmq'] = {}  # avoid talking to RabbitMQ during the test
        ds = Dataset.load(db, ds_id)
        job.run(ds)

        # dataset table asserts
        rows = db.select('SELECT id, name, input_path, upload_dt, status from dataset')
        input_path = join(dirname(__file__), 'data', test_ds_name)
        assert len(rows) == 1
        assert rows[0] == (ds_id, test_ds_name, input_path, upload_dt, DatasetStatus.FINISHED)

        # ms acquisition geometry asserts
        rows = db.select('SELECT acq_geometry from dataset')
        assert len(rows) == 1
        assert rows[0][0] == ds.get_acq_geometry(db)
        assert rows[0][0] == {
            ACQ_GEOMETRY_KEYS.LENGTH_UNIT: 'nm',
            ACQ_GEOMETRY_KEYS.AcqGridSection.section_name: {
                ACQ_GEOMETRY_KEYS.AcqGridSection.REGULAR_GRID: True,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_X : 3,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_Y : 3,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_X : 100,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_Y : 100
            },
            ACQ_GEOMETRY_KEYS.PixelSizeSection.section_name: {
                ACQ_GEOMETRY_KEYS.PixelSizeSection.REGULAR_SIZE: True,
                ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_X : 100,
                ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_Y : 100
            }
        }

        # job table asserts
        rows = db.select('SELECT db_id, ds_id, status, start, finish from job')
        assert len(rows) == 1
        db_id, ds_id, status, start, finish = rows[0]
        assert (db_id, ds_id, status) == (0, '2000-01-01_00h00m', 'FINISHED')
        assert start < finish

        # image metrics asserts
        rows = db.select(('SELECT db_id, sf, adduct, stats, iso_image_ids '
                          'FROM iso_image_metrics '
                          'ORDER BY sf, adduct'))

        assert rows[0] == (0, 'C12H24O', '+K', {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9,
                                                'total_iso_ints': [100.], 'min_iso_ints': [0], 'max_iso_ints': [10.]},
                           ['iso_image_1', None, None, None])
        assert rows[1] == (0, 'C12H24O', '+Na', {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9,
                                                 'total_iso_ints': [100.], 'min_iso_ints': [0], 'max_iso_ints': [10.]},
                           ['iso_image_1', None, None, None])

        time.sleep(1)  # Waiting for ES
        # ES asserts
        ds_docs = es_dsl_search.query('term', _type='dataset').execute().to_dict()['hits']['hits']
        assert 1 == len(ds_docs)
        ann_docs = es_dsl_search.query('term', _type='annotation').execute().to_dict()['hits']['hits']
        assert len(ann_docs) == len(rows)
        for doc in ann_docs:
            assert doc['_id'].startswith(ds_id)

    finally:
        db.close()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
Code example #26
#               'WHERE d.name = %s '
#               'ORDER BY t.target_add, t.sf_id')

EXPORT_SEL = ('SELECT adds.sf_id, adds.target_add, f.sf, adds.decoy_add '
              'FROM target_decoy_add adds '
              'JOIN agg_formula f ON f.id = adds.sf_id '
              'JOIN job j ON j.id = adds.job_id '
              'JOIN dataset ds ON ds.id = j.ds_id AND adds.db_id = f.db_id '
              'WHERE ds.name = %s '
              'ORDER BY adds.target_add, adds.sf_id')

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Exporting target/decoy sets into a csv file')
    parser.add_argument('ds_name', type=str, help='Dataset name')
    parser.add_argument('csv_path', type=str, help='Path for the csv file')
    parser.add_argument('--config', dest='sm_config_path', type=str, help='SM config path')
    parser.set_defaults(sm_config_path=path.join(proj_root(), 'conf/config.json'))
    args = parser.parse_args()

    SMConfig.set_path(args.sm_config_path)
    db = DB(SMConfig.get_conf()['db'])

    export_rs = db.select(EXPORT_SEL, args.ds_name)

    header = ','.join(['sf_id', 'target_add', 'sf', 'decoy_add']) + '\n'
    with open(args.csv_path, 'w') as f:
        f.write(header)
        f.writelines([','.join(map(str, row)) + '\n' for row in export_rs])

    logger.info('Exported all search results for "%s" dataset into "%s" file', args.ds_name, args.csv_path)
Code example #27
class SciTester(object):
    def __init__(self, sm_config_path):
        self.sm_config_path = sm_config_path
        self.sm_config = SMConfig.get_conf()
        self.db = DB(self.sm_config['db'])

        self.ds_id = '2000-01-01-00_00_00'
        self.base_search_res_path = join(proj_root(), 'tests/reports',
                                         'spheroid_untreated_search_res.csv')
        self.ds_name = 'sci_test_spheroid_untreated'
        self.data_dir_path = join(self.sm_config['fs']['base_path'],
                                  self.ds_name)
        self.input_path = join(proj_root(), 'tests/data/untreated')
        self.ds_config_path = join(self.input_path, 'config.json')
        self.metrics = ['chaos', 'spatial', 'spectral']

    def metr_dict_to_array(self, metr_d):
        metric_array = np.array([metr_d[m] for m in self.metrics])
        return np.hstack([metric_array, metric_array.prod()])

    def read_base_search_res(self):
        def prep_metric_arrays(a):
            return np.array(a, dtype=float)

        with open(self.base_search_res_path) as f:
            rows = map(lambda line: line.strip('\n').split('\t'),
                       f.readlines()[1:])
            return {(r[0], r[1]): prep_metric_arrays(r[2:]) for r in rows}

    def fetch_search_res(self):
        mol_db_service = MolDBServiceWrapper(
            self.sm_config['services']['mol_db'])
        mol_db_id = mol_db_service.find_db_by_name_version(
            'HMDB-v2.5')[0]['id']
        rows = self.db.select(SEARCH_RES_SELECT,
                              params=(mol_db_id, self.ds_name))
        return {(r[0], r[1]): self.metr_dict_to_array(r[2]) for r in rows}

    def save_sci_test_report(self):
        with open(self.base_search_res_path, 'w') as f:
            f.write('\t'.join(['sf', 'adduct'] + self.metrics) + '\n')
            for (sf,
                 adduct), metrics in sorted(self.fetch_search_res().items()):
                f.write('\t'.join([sf, adduct] +
                                  metrics.astype(str).tolist()) + '\n')

        print('Successfully saved sample dataset search report')

    @staticmethod
    def print_metric_hist(metric_arr, bins=10):
        metric_freq, metric_interv = np.histogram(metric_arr,
                                                  bins=np.linspace(-1, 1, 21))
        metric_interv = [round(x, 2) for x in metric_interv]
        pprint(
            list(zip(zip(metric_interv[:-1], metric_interv[1:]), metric_freq)))

    def report_metric_differences(self, metrics_array):
        metrics_array = np.array(metrics_array)
        print("\nCHAOS HISTOGRAM")
        self.print_metric_hist(metrics_array[:, 0])
        print("\nIMG_CORR HISTOGRAM")
        self.print_metric_hist(metrics_array[:, 1])
        print("\nPAT_MATCH HISTOGRAM")
        self.print_metric_hist(metrics_array[:, 2])
        print("\nMSM HISTOGRAM")
        self.print_metric_hist(metrics_array[:, 3])

    def _missed_formulas(self, old, new):
        missed_sf_adduct = set(old.keys()) - set(new.keys())
        print('MISSED FORMULAS: {:.1f}%'.format(
            len(missed_sf_adduct) / len(old) * 100))
        if missed_sf_adduct:
            missed_sf_base_metrics = np.array(
                [old[k] for k in missed_sf_adduct])
            self.report_metric_differences(missed_sf_base_metrics)
        return bool(missed_sf_adduct)

    def _false_discovery(self, old, new):
        new_sf_adduct = set(new.keys()) - set(old.keys())
        print('\nFALSE DISCOVERY: {:.1f}%'.format(
            len(new_sf_adduct) / len(old) * 100))

        if new_sf_adduct:
            for sf_adduct in new_sf_adduct:
                metrics = new[sf_adduct]
                print('{} metrics = {}'.format(sf_adduct, metrics))
        return bool(new_sf_adduct)

    def _metrics_diff(self, old, new):
        print('\nDIFFERENCE IN METRICS:')
        metric_diffs = []
        for b_sf_add, b_metr in old.items():
            if b_sf_add in new.keys():
                metr = new[b_sf_add]
                diff = b_metr - metr
                if np.any(np.abs(diff) > 1e-6):
                    metric_diffs.append(diff)
                    print('{} metrics diff = {}'.format(b_sf_add, diff))

        if metric_diffs:
            self.report_metric_differences(metric_diffs)
        return bool(metric_diffs)

    def search_results_are_different(self):
        old_search_res = self.read_base_search_res()
        search_res = self.fetch_search_res()
        return (self._missed_formulas(old_search_res, search_res)
                or self._false_discovery(old_search_res, search_res)
                or self._metrics_diff(old_search_res, search_res))

    def _create_img_store_mock(self):
        class ImageStoreMock(object):
            def post_image(self, *args):
                return None

            def delete_image_by_id(self, *args):
                return None

        return ImageStoreMock()

    def run_search(self, mock_img_store=False):
        if mock_img_store:
            img_store = self._create_img_store_mock()
        else:
            img_store = ImageStoreServiceWrapper(
                self.sm_config['services']['img_service_url'])
        manager = SMDaemonManager(db=self.db,
                                  es=ESExporter(self.db),
                                  img_store=img_store)

        ds = create_ds_from_files(self.ds_id, self.ds_name, self.input_path)
        from sm.engine.search_job import SearchJob
        manager.annotate(ds, search_job_factory=SearchJob, del_first=True)

    def clear_data_dirs(self):
        with warn_only():
            local('rm -rf {}'.format(self.data_dir_path))
Code example #28
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Exporting search results into a csv file')
    parser.add_argument('ds_name', type=str, help='Dataset name')
    parser.add_argument('csv_path', type=str, help='Path for the csv file')
    parser.add_argument('--config',
                        dest='sm_config_path',
                        type=str,
                        help='SM config path')
    parser.set_defaults(
        sm_config_path=path.join(proj_root(), 'conf/config.json'))
    args = parser.parse_args()

    SMConfig.set_path(args.sm_config_path)
    db = DB(SMConfig.get_conf()['db'])

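    # select_one returns one row; column 0 holds the dataset config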
    ds_config = db.select_one(DS_CONFIG_SEL, args.ds_name)[0]
    isotope_gen_config = ds_config['isotope_generation']
    charge = '{}{}'.format(isotope_gen_config['charge']['polarity'],
                           isotope_gen_config['charge']['n_charges'])
    export_rs = db.select(EXPORT_SEL, ds_config['database']['name'],
                          args.ds_name, isotope_gen_config['isocalc_sigma'],
                          charge, isotope_gen_config['isocalc_pts_per_mz'])

    header = '\t'.join(['formula_db', 'db_ids', 'sf_name', 'sf', 'adduct']) +'\t' + '\t'.join(metrics) + '\t' + \
             '\t'.join(['fdr', 'isocalc_sigma', 'isocalc_charge', 'isocalc_pts_per_mz', 'first_peak_mz']) + '\n'
    with open(args.csv_path, 'w') as f:
        f.write(header)
        f.writelines(['\t'.join(map(str, row)) + '\n' for row in export_rs])
    logger.info('Exported all search results for "%s" dataset into "%s" file',
                args.ds_name, args.csv_path)
Code example #29
class SciTester(object):

    def __init__(self, sm_config_path):
        self.sm_config_path = sm_config_path
        self.sm_config = SMConfig.get_conf()
        self.db = DB(self.sm_config['db'])

        self.ds_id = '2000-01-01-00_00_00'
        self.base_search_res_path = join(proj_root(), 'tests/reports', 'spheroid_untreated_search_res.csv')
        self.ds_name = 'sci_test_spheroid_untreated'
        self.data_dir_path = join(self.sm_config['fs']['base_path'], self.ds_name)
        self.input_path = join(proj_root(), 'tests/data/untreated')
        self.ds_config_path = join(self.input_path, 'config.json')
        self.metrics = ['chaos', 'spatial', 'spectral']

    def metr_dict_to_array(self, metr_d):
        metric_array = np.array([metr_d[m] for m in self.metrics])
        return np.hstack([metric_array, metric_array.prod()])

    def read_base_search_res(self):
        def prep_metric_arrays(a):
            return np.array(a, dtype=float)

        with open(self.base_search_res_path) as f:
            rows = map(lambda line: line.strip('\n').split('\t'), f.readlines()[1:])
            return {(r[0], r[1]):  prep_metric_arrays(r[2:]) for r in rows}

    def fetch_search_res(self):
        mol_db_service = MolDBServiceWrapper(self.sm_config['services']['mol_db'])
        mol_db_id = mol_db_service.find_db_by_name_version('HMDB-v2.5')[0]['id']
        rows = self.db.select(SEARCH_RES_SELECT, params=(mol_db_id, self.ds_name))
        return {(r[0], r[1]): self.metr_dict_to_array(r[2]) for r in rows}

    def save_sci_test_report(self):
        with open(self.base_search_res_path, 'w') as f:
            f.write('\t'.join(['sf', 'adduct'] + self.metrics) + '\n')
            for (sf, adduct), metrics in sorted(self.fetch_search_res().items()):
                f.write('\t'.join([sf, adduct] + metrics.astype(str).tolist()) + '\n')

        print('Successfully saved sample dataset search report')

    @staticmethod
    def print_metric_hist(metric_arr, bins=10):
        metric_freq, metric_interv = np.histogram(metric_arr, bins=np.linspace(-1, 1, 21))
        metric_interv = [round(x, 2) for x in metric_interv]
        pprint(list(zip(zip(metric_interv[:-1], metric_interv[1:]), metric_freq)))

    def report_metric_differences(self, metrics_array):
        metrics_array = np.array(metrics_array)
        print("\nCHAOS HISTOGRAM")
        self.print_metric_hist(metrics_array[:, 0])
        print("\nIMG_CORR HISTOGRAM")
        self.print_metric_hist(metrics_array[:, 1])
        print("\nPAT_MATCH HISTOGRAM")
        self.print_metric_hist(metrics_array[:, 2])
        print("\nMSM HISTOGRAM")
        self.print_metric_hist(metrics_array[:, 3])

    def _missed_formulas(self, old, new):
        missed_sf_adduct = set(old.keys()) - set(new.keys())
        print('MISSED FORMULAS: {:.1f}%'.format(len(missed_sf_adduct) / len(old) * 100))
        if missed_sf_adduct:
            missed_sf_base_metrics = np.array([old[k] for k in missed_sf_adduct])
            self.report_metric_differences(missed_sf_base_metrics)
        return bool(missed_sf_adduct)

    def _false_discovery(self, old, new):
        new_sf_adduct = set(new.keys()) - set(old.keys())
        print('\nFALSE DISCOVERY: {:.1f}%'.format(len(new_sf_adduct) / len(old) * 100))

        if new_sf_adduct:
            for sf_adduct in new_sf_adduct:
                metrics = new[sf_adduct]
                print('{} metrics = {}'.format(sf_adduct, metrics))
        return bool(new_sf_adduct)

    def _metrics_diff(self, old, new):
        print('\nDIFFERENCE IN METRICS:')
        metric_diffs = []
        for b_sf_add, b_metr in old.items():
            if b_sf_add in new.keys():
                metr = new[b_sf_add]
                diff = b_metr - metr
                if np.any(np.abs(diff) > 1e-6):
                    metric_diffs.append(diff)
                    print('{} metrics diff = {}'.format(b_sf_add, diff))

        if metric_diffs:
            self.report_metric_differences(metric_diffs)
        return bool(metric_diffs)

    def search_results_are_different(self):
        old_search_res = self.read_base_search_res()
        search_res = self.fetch_search_res()
        return (self._missed_formulas(old_search_res, search_res) or
                self._false_discovery(old_search_res, search_res) or
                self._metrics_diff(old_search_res, search_res))

    def _create_img_store_mock(self):

        class ImageStoreMock(object):
            def post_image(self, *args):
                return None

            def delete_image_by_id(self, *args):
                return None

        return ImageStoreMock()

    def run_search(self, mock_img_store=False):
        if mock_img_store:
            img_store = self._create_img_store_mock()
        else:
            img_store = ImageStoreServiceWrapper(self.sm_config['services']['img_service_url'])
        ds_man = SMDaemonDatasetManager(db=self.db, es=ESExporter(self.db),
                                        img_store=img_store, mode='local')

        ds = create_ds_from_files(self.ds_id, self.ds_name, self.input_path)
        from sm.engine import SearchJob
        ds_man.add(ds, search_job_factory=SearchJob, del_first=True)

    def clear_data_dirs(self):
        with warn_only():
            local('rm -rf {}'.format(self.data_dir_path))
Code example #30
    "SELECT f.sf, t.adduct, t.centr_mzs, t.centr_ints "
    "FROM public.agg_formula f, public.theor_peaks t "
    "WHERE t.sf_id = f.id AND f.db_id = 1 AND f.sf = %s AND t.adduct = %s "  # hardcoded to always fetch from HMDB, lazy i know
    "ORDER BY t.adduct;")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Exporting isotopic images')
    parser.add_argument('sf', type=str, help='sum formula')
    parser.add_argument('add', type=str, help='adduct')
    parser.add_argument('pkl_path', type=str, help='Path for the cPickle file')
    parser.add_argument('--config',
                        dest='sm_config_path',
                        type=str,
                        help='SM config path')
    parser.set_defaults(
        sm_config_path=path.join(proj_root(), 'conf/config.json'))
    args = parser.parse_args()

    SMConfig.set_path(args.sm_config_path)
    db = DB(SMConfig.get_conf()['db'])

    export_rs = db.select(EXPORT_SEL, args.sf, args.add)

    export_df = pd.DataFrame(
        export_rs, columns=['sf', 'adduct', 'centr_mzs', 'centr_ints'])

    export_df.to_csv(args.pkl_path, index=False)
    logger.info(
        'Exported the spectra for the "%s" sum formula, "%s" adduct into "%s" file',
        args.sf, args.add, args.pkl_path)
Code example #31
class TheorPeaksGenerator(object):
    """ Generator of theoretical isotope peaks for all molecules in a database.

    Args
    ----------
    sc : pyspark.SparkContext
    sm_config : dict
        SM engine config
    ds_config : dict
        Dataset config
    """
    def __init__(self, sc, sm_config, ds_config):
        self.sc = sc
        self.sm_config = sm_config
        self.ds_config = ds_config

        self.theor_peaks_tmp_dir = join(sm_config['fs']['base_path'],
                                        'tmp_theor_peaks_gen')
        self.db = DB(sm_config['db'])

        self.adducts = self.ds_config['isotope_generation']['adducts']

        self.isocalc_wrapper = IsocalcWrapper(
            self.ds_config['isotope_generation'])

    @staticmethod
    def _sf_elements(sf):
        return [
            seg.element().name() for seg in parseSumFormula(sf).get_segments()
        ]

    @classmethod
    def _valid_sf_adduct(cls, sf, adduct):
        if sf is None or adduct is None or sf == 'None' or adduct == 'None':
            logger.warning('Invalid sum formula or adduct: sf=%s, adduct=%s',
                           sf, adduct)
            return False

        if '-' in adduct and adduct.strip('-') not in cls._sf_elements(sf):
            logger.info(
                'No negative adduct element in the sum formula: sf=%s, adduct=%s',
                sf, adduct)
            return False

        return True

    def run(self):
        """ Starts peaks generation. Checks all formula peaks saved in the database and
        generates peaks only for new ones"""
        logger.info('Running theoretical peaks generation')

        db_id = self.db.select_one(DB_ID_SEL,
                                   self.ds_config['database']['name'])[0]
        formula_list = self.apply_database_filters(
            self.db.select(AGG_FORMULA_SEL, db_id))

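        # with this older DB API, query parameters are passed positionally after the SQL string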
        stored_sf_adduct = self.db.select(SF_ADDUCT_SEL, db_id,
                                          self.isocalc_wrapper.sigma,
                                          self.isocalc_wrapper.charge,
                                          self.isocalc_wrapper.pts_per_mz)

        sf_adduct_cand = self.find_sf_adduct_cand(formula_list,
                                                  set(stored_sf_adduct))
        logger.info('%d saved (sf, adduct)s, %s not saved (sf, adduct)s',
                    len(stored_sf_adduct), len(sf_adduct_cand))

        if sf_adduct_cand:
            self.generate_theor_peaks(sf_adduct_cand)

    def apply_database_filters(self, formula_list):
        """ Filters according to settings in dataset config

        Args
        ----
        formula_list : list
            List of pairs (id, sum formula) to search through

        Returns
        -------
        : list
            Filtered list of pairs (id, sum formula)
        """
        if 'organic' in map(lambda s: s.lower(),
                            self.ds_config['database'].get('filters', [])):
            logger.info('Organic sum formula filter has been applied')
            return filter(lambda (_, sf): 'C' in self._sf_elements(sf),
                          formula_list)
        return formula_list

    def find_sf_adduct_cand(self, formula_list, stored_sf_adduct):
        """
        Args
        ----
        formula_list : list
            List of pairs (id, sum formula) to search through
        stored_sf_adduct : set
            Set of (formula, adduct) pairs which have theoretical patterns saved in the database

        Returns
        -------
        : list
            List of (formula id, formula, adduct) triples which don't have theoretical patterns saved in the database
        """
        assert formula_list, 'Empty agg_formula table!'
        adducts = set(self.adducts) | set(DECOY_ADDUCTS)
        cand = [(id, sf, a) for (id, sf) in formula_list for a in adducts]
        return filter(
            lambda (sf_id, sf, adduct): (sf, adduct) not in stored_sf_adduct,
            cand)

    def generate_theor_peaks(self, sf_adduct_cand):
        """
        Args
        ----
        sf_adduct_cand : list
            List of (formula id, formula, adduct) triples which don't have theoretical patterns saved in the database

        Returns
        -------
        : list
            List of strings with formatted theoretical peaks data
        """
        logger.info('Generating missing peaks')
        formatted_iso_peaks = self.isocalc_wrapper.formatted_iso_peaks
        db_id = self.db.select_one(DB_ID_SEL,
                                   self.ds_config['database']['name'])[0]
        n = 10000
        for i in xrange(0, len(sf_adduct_cand), n):
            sf_adduct_cand_rdd = self.sc.parallelize(sf_adduct_cand[i:i + n],
                                                     numSlices=128)
            peak_lines = (sf_adduct_cand_rdd.flatMap(
                lambda (sf_id, sf, adduct): formatted_iso_peaks(
                    db_id, sf_id, sf, adduct)).collect())
            self._import_theor_peaks_to_db(peak_lines)

    def _import_theor_peaks_to_db(self, peak_lines):
        logger.info('Saving new peaks to the DB')
        if not exists(self.theor_peaks_tmp_dir):
            makedirs(self.theor_peaks_tmp_dir)

        peak_lines_path = join(self.theor_peaks_tmp_dir, 'peak_lines.csv')
        with open(peak_lines_path, 'w') as f:
            f.write('\n'.join(peak_lines))

        with open(peak_lines_path) as peaks_file:
            self.db.copy(peaks_file, 'theor_peaks')
Code example #32
class SearchJob(object):
    """ Main class responsible for molecule search. Uses other modules of the engine.

    Args
    ----
    no_clean : bool
        Don't delete interim data files
    """
    def __init__(self, img_store=None, no_clean=False):
        self.no_clean = no_clean
        self._img_store = img_store

        self._job_id = None
        self._sc = None
        self._db = None
        self._ds = None
        self._ds_reader = None
        self._status_queue = None
        self._fdr = None
        self._wd_manager = None
        self._es = None

        self._sm_config = SMConfig.get_conf()

        logger.debug('Using SM config:\n%s', pformat(self._sm_config))

    def _configure_spark(self):
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self._sm_config['spark'].items():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        if 'aws' in self._sm_config:
            sconf.set("spark.hadoop.fs.s3a.access.key", self._sm_config['aws']['aws_access_key_id'])
            sconf.set("spark.hadoop.fs.s3a.secret.key", self._sm_config['aws']['aws_secret_access_key'])
            sconf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            sconf.set("spark.hadoop.fs.s3a.endpoint", "s3.{}.amazonaws.com".format(self._sm_config['aws']['aws_region']))

        self._sc = SparkContext(master=self._sm_config['spark']['master'], conf=sconf, appName='SM engine')

    def _init_db(self):
        logger.info('Connecting to the DB')
        self._db = DB(self._sm_config['db'])

    def store_job_meta(self, mol_db_id):
        """ Store search job metadata in the database """
        logger.info('Storing job metadata')
        rows = [(mol_db_id, self._ds.id, 'STARTED', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))]
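        # insert_return returns the generated ids for the inserted rows; keep the single new job id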
        self._job_id = self._db.insert_return(JOB_INS, rows=rows)[0]

    def _run_annotation_job(self, mol_db):
        try:
            self.store_job_meta(mol_db.id)
            mol_db.set_job_id(self._job_id)

            logger.info("Running new job ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                        self._ds.id, self._ds.name, mol_db.name, mol_db.version)

            target_adducts = self._ds.config['isotope_generation']['adducts']
            self._fdr = FDR(job_id=self._job_id,
                            decoy_sample_size=20,
                            target_adducts=target_adducts,
                            db=self._db)

            isocalc = IsocalcWrapper(self._ds.config['isotope_generation'])
            centroids_gen = IonCentroidsGenerator(sc=self._sc, moldb_name=mol_db.name, isocalc=isocalc)
            polarity = self._ds.config['isotope_generation']['charge']['polarity']
            all_adducts = list(set(self._sm_config['defaults']['adducts'][polarity]) | set(DECOY_ADDUCTS))
            centroids_gen.generate_if_not_exist(isocalc=isocalc,
                                                sfs=mol_db.sfs,
                                                adducts=all_adducts)
            target_ions = centroids_gen.ions(target_adducts)
            self._fdr.decoy_adducts_selection(target_ions)

            search_alg = MSMBasicSearch(sc=self._sc, ds=self._ds, ds_reader=self._ds_reader,
                                        mol_db=mol_db, centr_gen=centroids_gen,
                                        fdr=self._fdr, ds_config=self._ds.config)
            ion_metrics_df, ion_iso_images = search_alg.search()

            search_results = SearchResults(mol_db.id, self._job_id, search_alg.metrics.keys())
            mask = self._ds_reader.get_2d_sample_area_mask()
            img_store_type = self._ds.get_ion_img_storage_type(self._db)
            search_results.store(ion_metrics_df, ion_iso_images, mask, self._db, self._img_store, img_store_type)
        except Exception as e:
            self._db.alter(JOB_UPD, params=('FAILED', datetime.now().strftime('%Y-%m-%d %H:%M:%S'), self._job_id))
            msg = 'Job failed(ds_id={}, mol_db={}): {}'.format(self._ds.id, mol_db, str(e))
            raise JobFailedError(msg) from e
        else:
            self._export_search_results_to_es(mol_db, isocalc)

    def _export_search_results_to_es(self, mol_db, isocalc):
        try:
            self._es.index_ds(self._ds.id, mol_db, isocalc)
        except Exception as e:
            self._db.alter(JOB_UPD, params=('FAILED', datetime.now().strftime('%Y-%m-%d %H:%M:%S'), self._job_id))
            msg = 'Export to ES failed(ds_id={}, mol_db={}): {}'.format(self._ds.id, mol_db, str(e))
            raise ESExportFailedError(msg) from e
        else:
            self._db.alter(JOB_UPD, params=('FINISHED', datetime.now().strftime('%Y-%m-%d %H:%M:%S'), self._job_id))

    def _remove_annotation_job(self, mol_db):
        logger.info("Removing job results ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                    self._ds.id, self._ds.name, mol_db.name, mol_db.version)
        self._db.alter('DELETE FROM job WHERE ds_id = %s and db_id = %s', params=(self._ds.id, mol_db.id))
        self._es.delete_ds(self._ds.id, mol_db)

    def _moldb_ids(self):
        moldb_service = MolDBServiceWrapper(self._sm_config['services']['mol_db'])
        completed_moldb_ids = {moldb_service.find_db_by_id(db_id)['id']
                               for (_, db_id) in self._db.select(JOB_ID_MOLDB_ID_SEL, params=(self._ds.id,))}
        new_moldb_ids = {moldb_service.find_db_by_name_version(moldb_name)[0]['id']
                         for moldb_name in self._ds.config['databases']}
        return completed_moldb_ids, new_moldb_ids

    def _save_data_from_raw_ms_file(self):
        ms_file_type_config = SMConfig.get_ms_file_handler(self._wd_manager.local_dir.ms_file_path)
        acq_geometry_factory_module = ms_file_type_config['acq_geometry_factory']
        acq_geometry_factory = getattr(import_module(acq_geometry_factory_module['path']),
                                                acq_geometry_factory_module['name'])

        acq_geometry = acq_geometry_factory(self._wd_manager.local_dir.ms_file_path).create()
        self._ds.save_acq_geometry(self._db, acq_geometry)

        self._ds.save_ion_img_storage_type(self._db, ms_file_type_config['img_storage_type'])

    def run(self, ds):
        """ Entry point of the engine. Molecule search is completed in several steps:
            * Copying input data to the engine work dir
            * Converting input mass spec files to plain text format (one line per spectrum)
            * Generating theoretical peaks for all formulas in the molecule database and saving them to the database
            * Molecule search, the most compute-intensive part; Spark is used to run it in a distributed manner
            * Saving results (isotope images and their quality metrics for each putative molecule) to the database

        Args
        ----
            ds : sm.engine.dataset_manager.Dataset
        """
        try:
            start = time.time()

            self._init_db()
            self._es = ESExporter(self._db)
            self._ds = ds

            if self._sm_config['rabbitmq']:
                self._status_queue = QueuePublisher(config=self._sm_config['rabbitmq'],
                                                    qdesc=SM_DS_STATUS,
                                                    logger=logger)
            else:
                self._status_queue = None
            ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.STARTED)

            self._wd_manager = WorkDirManager(ds.id)
            self._configure_spark()

            if not self.no_clean:
                self._wd_manager.clean()

            self._ds_reader = DatasetReader(self._ds.input_path, self._sc, self._wd_manager)
            self._ds_reader.copy_convert_input_data()

            self._save_data_from_raw_ms_file()
            self._img_store.storage_type = self._ds.get_ion_img_storage_type(self._db)

            ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.STARTED)

            logger.info('Dataset config:\n%s', pformat(self._ds.config))

            completed_moldb_ids, new_moldb_ids = self._moldb_ids()
            for moldb_id in completed_moldb_ids.symmetric_difference(new_moldb_ids):  # ignore ids present in both sets
                mol_db = MolecularDB(id=moldb_id, db=self._db,
                                     iso_gen_config=self._ds.config['isotope_generation'])
                if moldb_id not in new_moldb_ids:
                    self._remove_annotation_job(mol_db)
                elif moldb_id not in completed_moldb_ids:
                    self._run_annotation_job(mol_db)

            ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.FINISHED)

            logger.info("All done!")
            time_spent = time.time() - start
            logger.info('Time spent: %d mins %d secs', *divmod(int(round(time_spent)), 60))
        except Exception as e:
            if self._ds:
                ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.FAILED)
            logger.error(e, exc_info=True)
            raise
        finally:
            if self._sc:
                self._sc.stop()
            if self._db:
                self._db.close()
            if self._wd_manager and not self.no_clean:
                self._wd_manager.clean()
            logger.info('*' * 150)
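The run() docstring above spells out the pipeline; a minimal driver sketch, assuming an already-registered dataset and an image store client (the config path, dataset id, and import paths are inferred from the identifiers used in the snippet and may differ between engine versions):

# Sketch only: import paths are assumptions based on the names used above
# (SMConfig, DB, Dataset) and are not guaranteed to match every engine version.
from sm.engine.util import SMConfig
from sm.engine.db import DB
from sm.engine.dataset_manager import Dataset

SMConfig.set_path('conf/config.json')            # engine config (assumed location)
db = DB(SMConfig.get_conf()['db'])
try:
    ds = Dataset.load(db, '2000-01-01_00h00m')   # dataset must already be registered
finally:
    db.close()                                   # SearchJob opens its own DB connection in run()

img_store = None  # stand-in: supply the deployment's image store client here
job = SearchJob(img_store=img_store, no_clean=False)
job.run(ds)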
Code example #33
              "JOIN agg_formula f ON f.id = m.sf_id AND sf_db.id = f.db_id "
              "JOIN job j ON j.id = m.job_id "
              "JOIN dataset ds ON ds.id = j.ds_id "
              "JOIN theor_peaks tp ON tp.db_id = sf_db.id AND tp.sf_id = m.sf_id AND tp.adduct = m.adduct "
              "WHERE sf_db.name = %s AND ds.name = %s "
              "AND ROUND(sigma::numeric, 6) = %s AND charge = %s AND pts_per_mz = %s")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Exporting search results into a csv file')
    parser.add_argument('ds_name', type=str, help='Dataset name')
    parser.add_argument('csv_path', type=str, help='Path for the csv file')
    parser.add_argument('--config', dest='sm_config_path', type=str, help='SM config path')
    parser.set_defaults(sm_config_path=path.join(proj_root(), 'conf/config.json'))
    args = parser.parse_args()

    SMConfig.set_path(args.sm_config_path)
    db = DB(SMConfig.get_conf()['db'])

    ds_config = db.select_one(DS_CONFIG_SEL, args.ds_name)[0]
    isotope_gen_config = ds_config['isotope_generation']
    charge = '{}{}'.format(isotope_gen_config['charge']['polarity'], isotope_gen_config['charge']['n_charges'])
    export_rs = db.select(EXPORT_SEL, ds_config['database']['name'], args.ds_name,
                          isotope_gen_config['isocalc_sigma'], charge, isotope_gen_config['isocalc_pts_per_mz'])

    header = ','.join(['formula_db', 'ds_name', 'sf', 'adduct', 'chaos', 'img_corr', 'pat_match',
                       'isocalc_sigma', 'isocalc_charge', 'isocalc_pts_per_mz', 'first_peak_mz']) + '\n'
    with open(args.csv_path, 'w') as f:
        f.write(header)
        f.writelines([','.join(map(str, row)) + '\n' for row in export_rs])
    logger.info('Exported all search results for "%s" dataset into "%s" file', args.ds_name, args.csv_path)
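The script takes a dataset name, an output CSV path, and an optional --config path. Once it has run, the file can be sanity-checked against the header written above; the snippet below uses pandas purely for inspection (pandas is not a dependency of the script itself, and the path is a hypothetical csv_path):

# Quick sanity check of the exported CSV; the path is whatever was passed as csv_path.
import pandas as pd

df = pd.read_csv('/tmp/results.csv')  # hypothetical csv_path
assert list(df.columns) == [
    'formula_db', 'ds_name', 'sf', 'adduct', 'chaos', 'img_corr', 'pat_match',
    'isocalc_sigma', 'isocalc_charge', 'isocalc_pts_per_mz', 'first_peak_mz',
]
print(df[['sf', 'adduct', 'chaos', 'img_corr', 'pat_match']].head())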
Code example #34
def test_sm_daemons(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                    post_images_to_annot_service_mock,
                    MolDBServiceWrapperMock,
                    sm_config, test_db, es_dsl_search, clean_isotope_storage):
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)

    get_compute_img_metrics_mock.return_value = lambda *args: (0.9, 0.9, 0.9, [100.], [0], [10.])
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {
        'iso_image_ids': ['iso_image_1', None, None, None]
    }
    post_images_to_annot_service_mock.return_value = {
        35: url_dict,
        44: url_dict
    }

    db = DB(sm_config['db'])
    es = ESExporter(db)
    annotate_daemon = None
    update_daemon = None

    try:
        ds_config_str = open(ds_config_path).read()
        upload_dt = datetime.now()
        ds_id = '2000-01-01_00h00m'
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{"Data_Type": "Imaging MS"}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])

        ds = Dataset.load(db, ds_id)
        queue_pub.publish({'ds_id': ds.id, 'ds_name': ds.name, 'action': 'annotate'})

        run_daemons(db, es)

        # dataset table asserts
        rows = db.select('SELECT id, name, input_path, upload_dt, status from dataset')
        input_path = join(dirname(__file__), 'data', test_ds_name)
        assert len(rows) == 1
        assert rows[0] == (ds_id, test_ds_name, input_path, upload_dt, DatasetStatus.FINISHED)

        # ms acquisition geometry asserts
        rows = db.select('SELECT acq_geometry from dataset')
        assert len(rows) == 1
        assert rows[0][0] == ds.get_acq_geometry(db)
        assert rows[0][0] == {
            ACQ_GEOMETRY_KEYS.LENGTH_UNIT: 'nm',
            ACQ_GEOMETRY_KEYS.AcqGridSection.section_name: {
                ACQ_GEOMETRY_KEYS.AcqGridSection.REGULAR_GRID: True,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_X : 3,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_Y : 3,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_X : 100,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_Y : 100
            },
            ACQ_GEOMETRY_KEYS.PixelSizeSection.section_name: {
                ACQ_GEOMETRY_KEYS.PixelSizeSection.REGULAR_SIZE: True,
                ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_X : 100,
                ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_Y : 100
            }
        }

        # job table asserts
        rows = db.select('SELECT db_id, ds_id, status, start, finish from job')
        assert len(rows) == 1
        db_id, ds_id, status, start, finish = rows[0]
        assert (db_id, ds_id, status) == (0, '2000-01-01_00h00m', JobStatus.FINISHED)
        assert start < finish

        # image metrics asserts
        rows = db.select(('SELECT db_id, sf, adduct, stats, iso_image_ids '
                          'FROM iso_image_metrics '
                          'ORDER BY sf, adduct'))

        assert rows[0] == (0, 'C12H24O', '+K', {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9,
                                                'total_iso_ints': [100.], 'min_iso_ints': [0], 'max_iso_ints': [10.]},
                           ['iso_image_1', None, None, None])
        assert rows[1] == (0, 'C12H24O', '+Na', {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9,
                                                 'total_iso_ints': [100.], 'min_iso_ints': [0], 'max_iso_ints': [10.]},
                           ['iso_image_1', None, None, None])

        time.sleep(1)  # Waiting for ES
        # ES asserts
        ds_docs = es_dsl_search.query('term', _type='dataset').execute().to_dict()['hits']['hits']
        assert 1 == len(ds_docs)
        ann_docs = es_dsl_search.query('term', _type='annotation').execute().to_dict()['hits']['hits']
        assert len(ann_docs) == len(rows)
        for doc in ann_docs:
            assert doc['_id'].startswith(ds_id)

    finally:
        db.close()
        if annotate_daemon:
            annotate_daemon.stop()
        if update_daemon:
            update_daemon.stop()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
Code example #35
File: search_job.py  Project: metaspace2020/offsample
class SearchJob(object):
    """ Main class responsible for molecule search. Uses other modules of the engine.

    Args
    ----
    no_clean : bool
        Don't delete interim data files
    """
    def __init__(self, img_store=None, no_clean=False):
        self.no_clean = no_clean
        self._img_store = img_store

        self._job_id = None
        self._sc = None
        self._db = None
        self._ds = None
        self._ds_reader = None
        self._status_queue = None
        self._fdr = None
        self._wd_manager = None
        self._es = None

        self._sm_config = SMConfig.get_conf()

        logger.debug('Using SM config:\n%s', pformat(self._sm_config))

    def _configure_spark(self):
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self._sm_config['spark'].items():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        if 'aws' in self._sm_config:
            sconf.set("spark.hadoop.fs.s3a.access.key",
                      self._sm_config['aws']['aws_access_key_id'])
            sconf.set("spark.hadoop.fs.s3a.secret.key",
                      self._sm_config['aws']['aws_secret_access_key'])
            sconf.set("spark.hadoop.fs.s3a.impl",
                      "org.apache.hadoop.fs.s3a.S3AFileSystem")
            sconf.set(
                "spark.hadoop.fs.s3a.endpoint", "s3.{}.amazonaws.com".format(
                    self._sm_config['aws']['aws_region']))

        self._sc = SparkContext(master=self._sm_config['spark']['master'],
                                conf=sconf,
                                appName='SM engine')

    def _init_db(self):
        logger.info('Connecting to the DB')
        self._db = DB(self._sm_config['db'])

    def store_job_meta(self, mol_db_id):
        """ Store search job metadata in the database """
        logger.info('Storing job metadata')
        rows = [(mol_db_id, self._ds.id, JobStatus.RUNNING,
                 datetime.now().strftime('%Y-%m-%d %H:%M:%S'))]
        self._job_id = self._db.insert_return(JOB_INS, rows=rows)[0]

    def _run_annotation_job(self, mol_db):
        try:
            self.store_job_meta(mol_db.id)
            mol_db.set_job_id(self._job_id)

            logger.info(
                "Running new job ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                self._ds.id, self._ds.name, mol_db.name, mol_db.version)

            target_adducts = self._ds.config['isotope_generation']['adducts']
            self._fdr = FDR(job_id=self._job_id,
                            decoy_sample_size=20,
                            target_adducts=target_adducts,
                            db=self._db)

            isocalc = IsocalcWrapper(self._ds.config['isotope_generation'])
            centroids_gen = IonCentroidsGenerator(sc=self._sc,
                                                  moldb_name=mol_db.name,
                                                  isocalc=isocalc)
            polarity = self._ds.config['isotope_generation']['charge'][
                'polarity']
            all_adducts = list(
                set(self._sm_config['defaults']['adducts'][polarity])
                | set(DECOY_ADDUCTS))
            centroids_gen.generate_if_not_exist(isocalc=isocalc,
                                                sfs=mol_db.sfs,
                                                adducts=all_adducts)
            target_ions = centroids_gen.ions(target_adducts)
            self._fdr.decoy_adducts_selection(target_ions)

            search_alg = MSMBasicSearch(sc=self._sc,
                                        ds=self._ds,
                                        ds_reader=self._ds_reader,
                                        mol_db=mol_db,
                                        centr_gen=centroids_gen,
                                        fdr=self._fdr,
                                        ds_config=self._ds.config)
            ion_metrics_df, ion_iso_images = search_alg.search()

            search_results = SearchResults(mol_db.id, self._job_id,
                                           search_alg.metrics.keys())
            mask = self._ds_reader.get_2d_sample_area_mask()
            img_store_type = self._ds.get_ion_img_storage_type(self._db)
            search_results.store(ion_metrics_df, ion_iso_images, mask,
                                 self._db, self._img_store, img_store_type)
        except Exception as e:
            self._db.alter(
                JOB_UPD_STATUS_FINISH,
                params=(JobStatus.FAILED,
                        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                        self._job_id))
            msg = 'Job failed(ds_id={}, mol_db={}): {}'.format(
                self._ds.id, mol_db, str(e))
            raise JobFailedError(msg) from e
        else:
            self._db.alter(
                JOB_UPD_STATUS_FINISH,
                params=(JobStatus.FINISHED,
                        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                        self._job_id))

    def _remove_annotation_job(self, mol_db):
        logger.info(
            "Removing job results ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
            self._ds.id, self._ds.name, mol_db.name, mol_db.version)
        self._db.alter('DELETE FROM job WHERE ds_id = %s and db_id = %s',
                       params=(self._ds.id, mol_db.id))
        self._es.delete_ds(self._ds.id, mol_db)

    def _moldb_ids(self):
        completed_moldb_ids = {
            db_id
            for (_, db_id) in self._db.select(JOB_ID_MOLDB_ID_SEL,
                                              params=(self._ds.id, ))
        }
        new_moldb_ids = {
            MolecularDB(name=moldb_name).id
            for moldb_name in self._ds.config['databases']
        }
        return completed_moldb_ids, new_moldb_ids

    def _save_data_from_raw_ms_file(self):
        ms_file_type_config = SMConfig.get_ms_file_handler(
            self._wd_manager.local_dir.ms_file_path)
        acq_geometry_factory_module = ms_file_type_config[
            'acq_geometry_factory']
        acq_geometry_factory = getattr(
            import_module(acq_geometry_factory_module['path']),
            acq_geometry_factory_module['name'])

        acq_geometry = acq_geometry_factory(
            self._wd_manager.local_dir.ms_file_path).create()
        self._ds.save_acq_geometry(self._db, acq_geometry)

        self._ds.save_ion_img_storage_type(
            self._db, ms_file_type_config['img_storage_type'])

    def run(self, ds):
        """ Entry point of the engine. Molecule search is completed in several steps:
            * Copying input data to the engine work dir
            * Converting input mass spec files to plain text format (one line per spectrum)
            * Generating theoretical peaks for all formulas in the molecule database and saving them to the database
            * Molecule search, the most compute-intensive part; Spark is used to run it in a distributed manner
            * Saving results (isotope images and their quality metrics for each putative molecule) to the database

        Args
        ----
            ds : sm.engine.dataset_manager.Dataset
        """
        try:
            logger.info('*' * 150)
            start = time.time()

            self._init_db()
            self._es = ESExporter(self._db)
            self._ds = ds

            if self._sm_config['rabbitmq']:
                self._status_queue = QueuePublisher(
                    config=self._sm_config['rabbitmq'],
                    qdesc=SM_DS_STATUS,
                    logger=logger)
            else:
                self._status_queue = None

            self._wd_manager = WorkDirManager(ds.id)
            self._configure_spark()

            if not self.no_clean:
                self._wd_manager.clean()

            self._ds_reader = DatasetReader(self._ds.input_path, self._sc,
                                            self._wd_manager)
            self._ds_reader.copy_convert_input_data()

            self._save_data_from_raw_ms_file()
            self._img_store.storage_type = self._ds.get_ion_img_storage_type(
                self._db)

            logger.info('Dataset config:\n%s', pformat(self._ds.config))

            completed_moldb_ids, new_moldb_ids = self._moldb_ids()
            for moldb_id in completed_moldb_ids.symmetric_difference(
                    new_moldb_ids):  # ignore ids present in both sets
                mol_db = MolecularDB(
                    id=moldb_id,
                    db=self._db,
                    iso_gen_config=self._ds.config['isotope_generation'])
                if moldb_id not in new_moldb_ids:
                    self._remove_annotation_job(mol_db)
                elif moldb_id not in completed_moldb_ids:
                    self._run_annotation_job(mol_db)

            logger.info("All done!")
            time_spent = time.time() - start
            logger.info('Time spent: %d mins %d secs',
                        *divmod(int(round(time_spent)), 60))
        finally:
            if self._sc:
                self._sc.stop()
            if self._db:
                self._db.close()
            if self._wd_manager and not self.no_clean:
                self._wd_manager.clean()
            logger.info('*' * 150)
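The molecular-database reconciliation in run() hinges on symmetric_difference: IDs only in the completed set get their jobs removed, IDs only in the requested set get annotated, and shared IDs are left alone. A self-contained sketch of that bookkeeping with made-up IDs:

# Standalone illustration of the moldb reconciliation loop in SearchJob.run().
completed_moldb_ids = {1, 2, 3}   # jobs already stored for this dataset
new_moldb_ids = {2, 3, 4}         # databases requested by the dataset config

for moldb_id in completed_moldb_ids.symmetric_difference(new_moldb_ids):
    if moldb_id not in new_moldb_ids:
        print(f'remove annotation job for moldb {moldb_id}')   # prints for id 1
    elif moldb_id not in completed_moldb_ids:
        print(f'run annotation job for moldb {moldb_id}')      # prints for id 4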
Code example #36
def run_coloc_jobs(
    sm_config, ds_id_str, sql_where, fix_missing, fix_corrupt, skip_existing, use_lithops
):
    assert (
        len(
            [
                data_source
                for data_source in [ds_id_str, sql_where, fix_missing, fix_corrupt]
                if data_source
            ]
        )
        == 1
    ), "Exactly one data source (ds_id, sql_where, fix_missing, fix_corrupt) must be specified"
    assert not (ds_id_str and sql_where)

    db = DB()

    if ds_id_str:
        ds_ids = ds_id_str.split(',')
    elif sql_where:
        ds_ids = [
            id for (id,) in db.select(f'SELECT DISTINCT dataset.id FROM dataset WHERE {sql_where}')
        ]
    else:
        mol_dbs = [
            (doc['id'], doc['name'])
            for doc in db.select_with_fields('SELECT id, name FROM molecular_db m')
        ]
        mol_db_ids, mol_db_names = map(list, zip(*mol_dbs))
        fdrs = [0.05, 0.1, 0.2, 0.5]
        algorithms = ['median_thresholded_cosine', 'cosine']

        if fix_missing:
            logger.info('Checking for missing colocalization jobs...')
            results = db.select(
                MISSING_COLOC_JOBS_SEL, [list(mol_db_ids), list(mol_db_names), fdrs, algorithms]
            )
            ds_ids = [ds_id for ds_id, in results]
            logger.info(f'Found {len(ds_ids)} missing colocalization sets')
        else:
            logger.info(
                'Checking all colocalization jobs. '
                'This is super slow: ~5 minutes per 1000 datasets...'
            )
            results = db.select(
                CORRUPT_COLOC_JOBS_SEL, [list(mol_db_ids), list(mol_db_names), fdrs, algorithms]
            )
            ds_ids = [ds_id for ds_id, in results]
            logger.info(f'Found {len(ds_ids)} corrupt colocalization sets')

    if not ds_ids:
        logger.warning('No datasets match filter')
        return

    if use_lithops:
        executor = Executor(sm_config['lithops'])

    for i, ds_id in enumerate(ds_ids):
        try:
            logger.info(f'Running colocalization on {i+1} out of {len(ds_ids)}')
            ds = Dataset.load(db, ds_id)
            coloc = Colocalization(db)
            if use_lithops:
                # noinspection PyUnboundLocalVariable
                coloc.run_coloc_job_lithops(executor, ds, reprocess=not skip_existing)
            else:
                coloc.run_coloc_job(ds, reprocess=not skip_existing)
        except Exception:
            logger.error(f'Failed to run colocalization on {ds_id}', exc_info=True)
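A hedged example of calling the function above; keyword names come from its signature, while the dataset id is invented. Exactly one of ds_id_str, sql_where, fix_missing and fix_corrupt may be truthy, per the assertion at the top of the function:

# Hypothetical driver for run_coloc_jobs; sm_config is the loaded engine config
# dict (how it is loaded is deployment-specific and only needed when use_lithops=True).
sm_config = ...
run_coloc_jobs(
    sm_config=sm_config,
    ds_id_str='2000-01-01_00h00m',  # comma-separated dataset ids
    sql_where=None,
    fix_missing=False,
    fix_corrupt=False,
    skip_existing=True,   # do not reprocess datasets that already have coloc jobs
    use_lithops=False,    # run locally instead of via a Lithops executor
)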
Code example #37
def test_sm_daemons(
    MSMSearchMock,
    call_off_sample_api_mock,
    post_images_to_image_store_mock,
    get_image_mock,
    get_ion_images_for_analysis_mock,
    # fixtures
    test_db,
    es_dsl_search,
    clean_isotope_storage,
    reset_queues,
    metadata,
    ds_config,
    queue_pub,
    local_sm_config,
):
    moldb = init_moldb()

    formula_metrics_df = pd.DataFrame({
        'formula_i': [0, 1, 2],
        'ion_formula': ['C12H24O-H2O+H', 'C12H24O-H2+O2-CO+Na', 'C12H24O'],
        'formula': ['C12H24O', 'C12H24O', 'C12H24O'],
        'modifier': ['-H2O+H', '-H2+O2-CO+Na', ''],
        'chem_mod': ['', '-H2+O2', ''],
        'neutral_loss': ['-H2O', '-CO', ''],
        'adduct': ['+H', '+Na', '[M]+'],
        'chaos': [0.9, 0.9, 0.9],
        'spatial': [0.9, 0.9, 0.9],
        'spectral': [0.9, 0.9, 0.9],
        'msm': [0.9**3, 0.9**3, 0.9**3],
        'total_iso_ints': [[100.0], [100.0], [100.0]],
        'min_iso_ints': [[0], [0], [0]],
        'max_iso_ints': [[10.0], [10.0], [10.0]],
        'fdr': [0.1, 0.1, 0.1],
    }).set_index('formula_i')
    search_algo_mock = MSMSearchMock()

    def mock_search(*args):
        # Read all spectra so that ImzML diagnostic fields are populated
        imzml_reader = MSMSearchMock.call_args_list[-1][1]['imzml_reader']
        _ = list(imzml_reader.iter_spectra(range(imzml_reader.n_spectra)))
        return [(formula_metrics_df, [], create_test_fdr_diagnostics_bundle())]

    search_algo_mock.search.side_effect = mock_search
    search_algo_mock.metrics = OrderedDict([
        ('chaos', 0),
        ('spatial', 0),
        ('spectral', 0),
        ('msm', 0),
        ('total_iso_ints', []),
        ('min_iso_ints', []),
        ('max_iso_ints', []),
    ])

    image_ids = ['iso_image_1', None, None, None]
    post_images_to_image_store_mock.return_value = {
        0: image_ids,
        1: image_ids,
        2: image_ids
    }

    db = DB()
    es = ESExporter(db, local_sm_config)

    ds = create_test_ds(
        name=test_ds_name,
        input_path=input_dir_path,
        config={
            **ds_config, 'database_ids': [moldb.id]
        },
        status=DatasetStatus.QUEUED,
        es=es,
    )

    queue_pub.publish({
        'ds_id': ds.id,
        'ds_name': test_ds_name,
        'action': DaemonAction.ANNOTATE
    })

    run_daemons(db, es, local_sm_config)

    # dataset table asserts
    rows = db.select(
        'SELECT id, name, input_path, upload_dt, status from dataset')
    input_path = join(dirname(__file__), 'data', test_ds_name)
    assert len(rows) == 1
    assert rows[0] == (ds.id, test_ds_name, input_path, ds.upload_dt,
                       DatasetStatus.FINISHED)

    # ms acquisition geometry asserts
    rows = db.select('SELECT acq_geometry from dataset')
    assert len(rows) == 1
    assert rows[0][0] == ds.get_acq_geometry(db)
    assert rows[0][0] == {
        'length_unit': 'nm',
        'acquisition_grid': {
            'regular_grid': True,
            'count_x': 3,
            'count_y': 3
        },
        'pixel_size': {
            'regular_size': True,
            'size_x': 100,
            'size_y': 100
        },
    }

    # job table asserts
    rows = db.select('SELECT moldb_id, ds_id, status, start, finish from job')
    assert len(rows) == 1
    moldb_id, ds_id, status, start, finish = rows[0]
    assert (moldb_id, ds_id, status) == (moldb.id, ds.id, JobStatus.FINISHED)
    assert start <= finish

    # image metrics asserts
    rows = db.select(
        'SELECT formula, adduct, msm, stats, iso_image_ids FROM annotation')
    rows = sorted(
        rows, key=lambda row: row[1]
    )  # Sort in Python because postgres sorts symbols inconsistently between locales
    assert len(rows) == 3
    for row, expected_adduct in zip(rows, ['+H', '+Na', '[M]+']):
        formula, adduct, msm, stats, iso_image_ids = row
        assert formula == 'C12H24O'
        assert adduct == expected_adduct
        assert np.isclose(msm, 0.9**3)
        assert stats == {
            'chaos': 0.9,
            'spatial': 0.9,
            'spectral': 0.9,
            'total_iso_ints': [100.0],
            'min_iso_ints': [0],
            'max_iso_ints': [10.0],
        }
        assert iso_image_ids == ['iso_image_1', None, None, None]

    time.sleep(1)  # Waiting for ES
    # ES asserts
    ds_docs = es_dsl_search.query(
        'term', _type='dataset').execute().to_dict()['hits']['hits']
    assert 1 == len(ds_docs)
    ann_docs = es_dsl_search.query(
        'term', _type='annotation').execute().to_dict()['hits']['hits']
    assert len(ann_docs) == len(rows)
    for doc in ann_docs:
        assert doc['_id'].startswith(ds_id)
Code example #38
class TheorPeaksGenerator(object):
    """ Generator of theoretical isotope peaks for all molecules in a database.

    Args
    ----------
    sc : pyspark.SparkContext
    sm_config : dict
        SM engine config
    ds_config : dict
        Dataset config
    """
    def __init__(self, sc, sm_config, ds_config):  # TODO: replace sm_config with db
        self.sc = sc
        self.sm_config = sm_config
        self.ds_config = ds_config

        self.theor_peaks_tmp_dir = join(sm_config['fs']['base_path'], 'tmp_theor_peaks_gen')
        self.db = DB(sm_config['db'])

        self.adducts = self.ds_config['isotope_generation']['adducts']

        self.isocalc_wrapper = IsocalcWrapper(self.ds_config['isotope_generation'])

    @staticmethod
    def _sf_elements(sf):
        return [seg.element().name() for seg in parseSumFormula(sf).get_segments()]

    @classmethod
    def _valid_sf_adduct(cls, sf, adduct):
        if sf is None or adduct is None or sf == 'None' or adduct == 'None':
            logger.warning('Invalid sum formula or adduct: sf=%s, adduct=%s', sf, adduct)
            return False

        if '-' in adduct and adduct.strip('-') not in cls._sf_elements(sf):
            logger.info('No negative adduct element in the sum formula: sf=%s, adduct=%s', sf, adduct)
            return False

        return True

    def run(self):
        """ Starts peaks generation. Checks all formula peaks saved in the database and
        generates peaks only for new ones"""
        logger.info('Running theoretical peaks generation')

        db_id = self.db.select_one(DB_ID_SEL, self.ds_config['database']['name'])[0]
        formula_list = self.apply_database_filters(self.db.select(AGG_FORMULA_SEL, db_id))

        stored_sf_adduct = self.db.select(SF_ADDUCT_SEL, db_id,
                                          self.isocalc_wrapper.sigma,
                                          self.isocalc_wrapper.charge,
                                          self.isocalc_wrapper.pts_per_mz)

        sf_adduct_cand = self.find_sf_adduct_cand(formula_list, set(stored_sf_adduct))
        logger.info('%d saved (sf, adduct)s, %d not saved (sf, adduct)s', len(stored_sf_adduct), len(sf_adduct_cand))

        if sf_adduct_cand:
            self.generate_theor_peaks(sf_adduct_cand)

    def apply_database_filters(self, formula_list):
        """ Filters according to settings in dataset config

        Args
        ----
        formula_list : list
            List of pairs (id, sum formula) to search through

        Returns
        -------
        : list
            Filtered list of pairs (id, sum formula)
        """
        if 'organic' in map(lambda s: s.lower(), self.ds_config['database'].get('filters', [])):
            logger.info('Organic sum formula filter has been applied')
            return [(sf_id, sf) for (sf_id, sf) in formula_list
                    if 'C' in self._sf_elements(sf)]
        return formula_list

    def find_sf_adduct_cand(self, formula_list, stored_sf_adduct):
        """
        Args
        ----
        formula_list : list
            List of pairs (id, sum formula) to search through
        stored_sf_adduct : set
            Set of (formula, adduct) pairs which have theoretical patterns saved in the database

        Returns
        -------
        : list
            List of (formula id, formula, adduct) triples which don't have theoretical patterns saved in the database
        """
        assert formula_list, 'Empty agg_formula table!'
        adducts = set(self.adducts) | set(DECOY_ADDUCTS)
        cand = [(sf_id, sf, a) for (sf_id, sf) in formula_list for a in adducts]
        return [(sf_id, sf, adduct) for (sf_id, sf, adduct) in cand
                if (sf, adduct) not in stored_sf_adduct]

    def generate_theor_peaks(self, sf_adduct_cand):
        """
        Args
        ----
        sf_adduct_cand : list
            List of (formula id, formula, adduct) triples which don't have theoretical patterns saved in the database

        The generated peak lines are imported into the database in chunks; nothing is returned.
        """
        logger.info('Generating missing peaks')
        formatted_iso_peaks = self.isocalc_wrapper.formatted_iso_peaks
        db_id = self.db.select_one(DB_ID_SEL, self.ds_config['database']['name'])[0]
        n = 10000
        for i in range(0, len(sf_adduct_cand), n):
            sf_adduct_cand_rdd = self.sc.parallelize(sf_adduct_cand[i:i+n], numSlices=128)
            peak_lines = (sf_adduct_cand_rdd
                          .flatMap(lambda sf_adduct: formatted_iso_peaks(db_id, *sf_adduct))
                          .collect())
            self._import_theor_peaks_to_db(peak_lines)

    def _import_theor_peaks_to_db(self, peak_lines):
        logger.info('Saving new peaks to the DB')
        if not exists(self.theor_peaks_tmp_dir):
            makedirs(self.theor_peaks_tmp_dir)

        peak_lines_path = join(self.theor_peaks_tmp_dir, 'peak_lines.csv')
        with open(peak_lines_path, 'w') as f:
            f.write('\n'.join(peak_lines))

        with open(peak_lines_path) as peaks_file:
            self.db.copy(peaks_file, 'theor_peaks')
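# Standalone illustration of the set arithmetic in find_sf_adduct_cand above:
# candidates are the cartesian product of formulas and adducts, minus the
# (formula, adduct) pairs whose theoretical patterns are already stored.
# All values below are invented for the example.
formula_list = [(1, 'C6H12O6'), (2, 'C5H5N5')]
adducts = ['+H', '+Na']
stored_sf_adduct = {('C6H12O6', '+H')}

cand = [(sf_id, sf, a) for (sf_id, sf) in formula_list for a in adducts]
missing = [(sf_id, sf, a) for (sf_id, sf, a) in cand if (sf, a) not in stored_sf_adduct]
print(missing)  # [(1, 'C6H12O6', '+Na'), (2, 'C5H5N5', '+H'), (2, 'C5H5N5', '+Na')]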