def test_search_job_imzml_example_es_export_fails(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                                                  post_images_to_annot_service_mock,
                                                  MolDBServiceWrapperMock, MolDBServiceWrapperMock2,
                                                  sm_config, create_fill_sm_database, es_dsl_search,
                                                  clean_isotope_storage):
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock2)

    get_compute_img_metrics_mock.return_value = lambda *args: (0.9, 0.9, 0.9, [100.], [0], [10.])
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {
        'iso_image_ids': ['iso_image_1', None, None, None]
    }
    post_images_to_annot_service_mock.return_value = {
        35: url_dict,
        44: url_dict
    }

    db = DB(sm_config['db'])

    def throw_exception_function(*args):
        raise Exception('Test')

    try:
        ds_id = '2000-01-01_00h00m'
        upload_dt = datetime.now()
        ds_config_str = open(ds_config_path).read()
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])

        with patch('sm.engine.search_job.ESExporter.index_ds') as index_ds_mock:
            index_ds_mock.side_effect = throw_exception_function

            img_store = ImageStoreServiceWrapper(sm_config['services']['img_service_url'])
            job = SearchJob(img_store=img_store)
            ds = Dataset.load(db, ds_id)
            job.run(ds)
    except ESExportFailedError as e:
        assert e
        # dataset table asserts
        row = db.select_one('SELECT status from dataset')
        assert row[0] == 'FAILED'
    else:
        raise AssertionError('ESExportFailedError should be raised')
    finally:
        db.close()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
Code example #2
def test_save_ds_meta_ds_doesnt_exist(spark_context, create_test_db, drop_test_db, sm_config, ds_config):
    work_dir_man_mock = MagicMock(WorkDirManager)
    work_dir_man_mock.ds_coord_path = '/ds_path'
    work_dir_man_mock.txt_path = '/txt_path'

    SMConfig._config_dict = sm_config

    with patch('sm.engine.tests.util.SparkContext.textFile') as m:
        m.return_value = spark_context.parallelize([
            '0,1,1\n',
            '1,100,200\n'])

        dataset = Dataset(spark_context, 'ds_name', '', ds_config, work_dir_man_mock, DB(sm_config['db']))
        dataset.save_ds_meta()

    db = DB(sm_config['db'])
    ds_row = db.select_one('SELECT name, file_path, img_bounds, config from dataset')
    assert ds_row == ('ds_name', '/txt_path',
                      {u'x': {u'min': 1, u'max': 100}, u'y': {u'min': 1, u'max': 200}},
                      ds_config)

    coord_row = db.select_one('SELECT xs, ys from coordinates')
    assert coord_row == ([1, 100], [1, 200])

    db.close()
Code example #3
def test_save_ds_meta_ds_doesnt_exist(spark_context, create_test_db,
                                      drop_test_db, sm_config, ds_config):
    work_dir_man_mock = MagicMock(WorkDirManager)
    work_dir_man_mock.ds_coord_path = '/ds_path'
    work_dir_man_mock.txt_path = '/txt_path'

    SMConfig._config_dict = sm_config

    with patch('sm.engine.tests.util.SparkContext.textFile') as m:
        m.return_value = spark_context.parallelize(['0,1,1\n', '1,100,200\n'])

        dataset = Dataset(spark_context, 'ds_name', '', 'input_path',
                          ds_config, work_dir_man_mock, DB(sm_config['db']))
        dataset.save_ds_meta()

    db = DB(sm_config['db'])
    ds_row = db.select_one(
        'SELECT name, file_path, img_bounds, config from dataset')
    assert ds_row == ('ds_name', 'input_path', {
        u'x': {
            u'min': 1,
            u'max': 100
        },
        u'y': {
            u'min': 1,
            u'max': 200
        }
    }, ds_config)

    coord_row = db.select_one('SELECT xs, ys from coordinates')
    assert coord_row == ([1, 100], [1, 200])

    db.close()
Code example #4
def test_save_sf_iso_images_correct_db_call(spark_context,
                                            create_fill_sm_database, sm_config,
                                            ds_config):
    sf_iso_imgs = spark_context.parallelize([((1, '+H'), [
        csr_matrix([[100, 0, 0], [0, 0, 0]]),
        csr_matrix([[0, 0, 0], [0, 0, 10]])
    ])])
    sf_adduct_peaksn = [(1, '+H', 2)]
    db_mock = MagicMock(spec=DB)  # assumed stand-in: image rows are written via per-partition DB connections
    res = SearchResults(0, 0, 0, 'ds_name', sf_adduct_peaksn, db_mock,
                        sm_config, ds_config)
    res.sf_iso_images = sf_iso_imgs
    res.nrows, res.ncols = 2, 3
    res.store_sf_iso_images()

    correct_rows = [(0, 0, 1, '+H', 0, [0], [100], 0, 100),
                    (0, 0, 1, '+H', 1, [5], [10], 0, 10)]

    db = DB(sm_config['db'])
    try:
        rows = db.select((
            'SELECT job_id, db_id, sf_id, adduct, peak, pixel_inds, intensities, min_int, max_int '
            'FROM iso_image '
            'ORDER BY sf_id, adduct'))
        assert correct_rows == rows
    finally:
        db.close()
Code example #5
def fin():
    db_config = dict(database='postgres',
                     user='******',
                     host='localhost',
                     password='******')
    db = DB(db_config, autocommit=True)
    db.alter('DROP DATABASE IF EXISTS sm_test')
    db.close()
Code example #6
def store_iso_img_rows(row_it):
    db = DB(db_config)
    try:
        rows = list(row_it)
        if rows:
            db.insert(SF_ISO_IMGS_INS, rows)
    finally:
        db.close()
Code example #7
File: util.py Project: frulo/SM_distributed
def create_test_db():
    db_config = dict(database='postgres', user='******', host='localhost')
    db = DB(db_config, autocommit=True)
    db.alter('DROP DATABASE IF EXISTS sm_test')
    db.alter('CREATE DATABASE sm_test')
    db.close()

    local('psql -h localhost -U sm sm_test < {}'.format(join(proj_root(), 'scripts/create_schema.sql')))
Code example #8
def test_sm_daemons_annot_fails(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                                post_images_to_annot_service_mock,
                                MolDBServiceWrapperMock,
                                sm_config, test_db, es_dsl_search,
                                clean_isotope_storage):
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)

    def throw_exception_function(*args):
        raise Exception('Test')
    get_compute_img_metrics_mock.return_value = throw_exception_function
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {
        'iso_image_ids': ['iso_image_1', None, None, None]
    }
    post_images_to_annot_service_mock.return_value = {
        35: url_dict,
        44: url_dict
    }

    db = DB(sm_config['db'])
    es = ESExporter(db)
    annotate_daemon = None

    try:
        ds_id = '2000-01-01_00h00m'
        upload_dt = datetime.now()
        ds_config_str = open(ds_config_path).read()
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])

        ds = Dataset.load(db, ds_id)
        queue_pub.publish({'ds_id': ds.id, 'ds_name': ds.name, 'action': 'annotate'})

        run_daemons(db, es)

        # dataset and job tables asserts
        row = db.select_one('SELECT status from dataset')
        assert row[0] == 'FAILED'
        row = db.select_one('SELECT status from job')
        assert row[0] == 'FAILED'
    finally:
        db.close()
        if annotate_daemon:
            annotate_daemon.stop()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
Code example #9
def create_test_db():
    db_config = dict(database='postgres', user='******', host='localhost')
    db = DB(db_config, autocommit=True)
    db.alter('DROP DATABASE IF EXISTS sm_test')
    db.alter('CREATE DATABASE sm_test')
    db.close()

    local('psql -h localhost -U sm sm_test < {}'.format(
        join(proj_root(), 'scripts/create_schema.sql')))
Code example #10
def fin():
    db = DB(db_config, autocommit=True)
    try:
        db.alter('DROP DATABASE IF EXISTS sm_test')
    except Exception as e:
        logging.getLogger('engine').warning(
            'Drop sm_test database failed: %s', e)
    finally:
        db.close()
Code example #11
def fill_test_db(create_test_db, drop_test_db):
    db_config = dict(database='sm_test', user='******', host='localhost', password='******')
    db = DB(db_config)
    try:
        db.alter('TRUNCATE dataset CASCADE')
        db.insert("INSERT INTO dataset VALUES (%s, %s, %s, %s, %s)",
                  [(1, 'ds_name', '/ds_path', json.dumps({}), json.dumps({}))])
        db.alter('TRUNCATE coordinates CASCADE')
    except:
        raise
    finally:
        db.close()
Code example #12
def test_theor_peaks_generator_run_failed_iso_peaks(create_fill_test_db, spark_context, sm_config, ds_config):
    ds_config["isotope_generation"]["adducts"] = ["+Na"]
    theor_peaks_gen = TheorPeaksGenerator(spark_context, sm_config, ds_config)
    theor_peaks_gen.isocalc_wrapper.isotope_peaks = lambda *args: Centroids([], [])
    theor_peaks_gen.run()

    db = DB(sm_config['db'])
    rows = db.select('SELECT * FROM theor_peaks')

    assert len(rows) == 1

    db.close()
Code example #13
def test_search_job_imzml_example(get_compute_img_measures_mock, filter_sf_metrics_mock, create_fill_sm_database, sm_config):
    get_compute_img_measures_mock.return_value = lambda *args: (0.9, 0.9, 0.9)
    filter_sf_metrics_mock.side_effect = lambda x: x

    SMConfig._config_dict = sm_config

    db = DB(sm_config['db'])
    try:
        job = SearchJob(None, 'imzml_example_ds')
        job.run(input_dir_path, ds_config_path, clean=True)

        # dataset meta asserts
        rows = db.select("SELECT name, file_path, img_bounds from dataset")
        img_bounds = {u'y': {u'max': 3, u'min': 1}, u'x': {u'max': 3, u'min': 1}}
        file_path = join(dirname(__file__), 'data', 'imzml_example_ds')
        assert len(rows) == 1
        assert rows[0] == (test_ds_name, file_path, img_bounds)

        # theoretical patterns asserts
        rows = db.select('SELECT db_id, sf_id, adduct, centr_mzs, centr_ints, prof_mzs, prof_ints '
                         'FROM theor_peaks '
                         'ORDER BY adduct')

        assert len(rows) == 3 + len(DECOY_ADDUCTS)
        for r in rows:
            assert r[3] and r[4]

        # image metrics asserts
        rows = db.select(('SELECT db_id, sf_id, adduct, peaks_n, stats FROM iso_image_metrics '
                          'ORDER BY sf_id, adduct'))

        assert rows
        assert rows[0]
        assert tuple(rows[0][:2]) == (0, 10007)
        assert set(rows[0][4].keys()) == {'chaos', 'spatial', 'spectral'}

        # image asserts
        rows = db.select(('SELECT db_id, sf_id, adduct, peak, intensities, min_int, max_int '
                          'FROM iso_image '
                          'ORDER BY sf_id, adduct'))
        assert rows

        max_int = 0.0
        for r in rows:
            max_int = max(max_int, r[-1])
            assert tuple(r[:2]) == (0, 10007)
        assert max_int

    finally:
        db.close()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
Code example #14
def test_search_job_imzml_example(get_compute_img_measures_mock, create_fill_sm_database, sm_config):
    get_compute_img_measures_mock.return_value = lambda *args: (0.9, 0.9, 0.9)

    SMConfig._config_dict = sm_config

    db = DB(sm_config['db'])
    try:
        job = SearchJob(None, 'imzml_example_ds')
        job.run(input_dir_path, ds_config_path, clean=True)

        # dataset meta asserts
        rows = db.select("SELECT name, file_path, img_bounds from dataset")
        img_bounds = {u'y': {u'max': 3, u'min': 1}, u'x': {u'max': 3, u'min': 1}}
        file_path = 'file://' + join(data_dir_path, 'ds.txt')
        assert len(rows) == 1
        assert rows[0] == (test_ds_name, file_path, img_bounds)

        # theoretical patterns asserts
        rows = db.select('SELECT db_id, sf_id, adduct, centr_mzs, centr_ints, prof_mzs, prof_ints '
                         'FROM theor_peaks '
                         'ORDER BY adduct')

        assert len(rows) == 3 + len(DECOY_ADDUCTS)
        for r in rows:
            assert r[3] and r[4]

        # image metrics asserts
        rows = db.select(('SELECT db_id, sf_id, adduct, peaks_n, stats FROM iso_image_metrics '
                          'ORDER BY sf_id, adduct'))

        assert rows
        assert rows[0]
        assert tuple(rows[0][:2]) == (0, 10007)
        assert set(rows[0][4].keys()) == {'chaos', 'spatial', 'spectral'}

        # image asserts
        rows = db.select(('SELECT db_id, sf_id, adduct, peak, intensities, min_int, max_int '
                          'FROM iso_image '
                          'ORDER BY sf_id, adduct'))
        assert rows

        max_int = 0.0
        for r in rows:
            max_int = max(max_int, r[-1])
            assert tuple(r[:2]) == (0, 10007)
        assert max_int

    finally:
        db.close()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
Code example #15
def fill_test_db(create_test_db, drop_test_db):
    db_config = dict(database='sm_test',
                     user='******',
                     host='localhost',
                     password='******')
    db = DB(db_config)
    try:
        db.alter('TRUNCATE dataset CASCADE')
        db.insert("INSERT INTO dataset VALUES (%s, %s, %s, %s, %s)",
                  [(1, 'ds_name', '/ds_path', json.dumps({}), json.dumps({}))])
        db.alter('TRUNCATE coordinates CASCADE')
    except:
        raise
    finally:
        db.close()
Code example #16
def create_fill_sm_database(create_test_db, drop_test_db, sm_config):
    local('psql -h localhost -U sm sm_test < {}'.format(join(proj_dir_path, 'scripts/create_schema.sql')))

    db = DB(sm_config['db'])
    try:
        db.insert('INSERT INTO formula_db VALUES (%s, %s, %s)',
                  [(0, '2016-01-01', 'HMDB')])
        db.insert('INSERT INTO formula VALUES (%s, %s, %s, %s, %s)',
                  [(100, 0, '00001', 'compound_name', 'C12H24O')])
        db.insert('INSERT INTO agg_formula VALUES (%s, %s, %s, %s, %s)',
                  [(10007, 0, 'C12H24O', ['00001'], ['compound_name'])])
    except:
        raise
    finally:
        db.close()
Code example #17
def create_fill_sm_database(create_test_db, drop_test_db, sm_config):
    local('psql -h localhost -U sm sm_test < {}'.format(join(proj_dir_path, 'scripts/create_schema.sql')))

    db = DB(sm_config['db'])
    try:
        db.insert('INSERT INTO formula_db VALUES (%s, %s, %s)',
                  [(0, '2016-01-01', 'HMDB')])
        db.insert('INSERT INTO formula VALUES (%s, %s, %s, %s, %s)',
                  [(100, 0, '00001', 'compound_name', 'C12H24O')])
        db.insert('INSERT INTO agg_formula VALUES (%s, %s, %s, %s, %s)',
                  [(10007, 0, 'C12H24O', ['00001'], ['compound_name'])])
    except:
        raise
    finally:
        db.close()
Code example #18
def create_fill_test_db(create_test_db, drop_test_db):
    db_config = dict(database='sm_test', user='******', host='localhost', password='******')
    db = DB(db_config)
    try:
        db.alter('TRUNCATE formula_db CASCADE')
        db.insert('INSERT INTO formula_db VALUES (%s, %s, %s)', [(0, '2016-01-01', 'HMDB')])
        db.insert('INSERT INTO formula VALUES (%s, %s, %s, %s, %s)', [(9, 0, '04138', 'Au', 'Gold')])
        db.insert('INSERT INTO agg_formula VALUES (%s, %s, %s, %s, %s)', [(9, 0, 'Au', ['04138'], ['Gold'])])
        db.alter('TRUNCATE theor_peaks CASCADE')
        db.insert('INSERT INTO theor_peaks VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)',
                  [(0, 9, '+H', 0.01, 1, 10000, [100, 200], [10, 1], [], [])])
    except:
        raise
    finally:
        db.close()
Code example #19
def create_fill_sm_database(create_test_db, drop_test_db, sm_config):
    proj_dir_path = dirname(dirname(__file__))
    local('psql -h localhost -U sm sm_test < {}'.format(
        join(proj_dir_path, 'scripts/create_schema.sql')))

    db = DB(sm_config['db'])
    try:
        db.insert('INSERT INTO dataset VALUES (%s, %s, %s, %s, %s, %s)',
                  [(0, 'name', 0, 'fpath', json.dumps({}), json.dumps({}))])
        db.insert('INSERT INTO job VALUES (%s, %s, %s, %s, %s, %s, %s, %s)',
                  [(0, 0, 0, '', 0, 0, None, None)])
    except:
        raise
    finally:
        db.close()
Code example #20
def test_theor_peaks_generator_run_1(create_fill_test_db, spark_context, sm_config, ds_config):
    ds_config["isotope_generation"]["adducts"] = ["+H", "+Na"]
    theor_peaks_gen = TheorPeaksGenerator(spark_context, sm_config, ds_config)
    theor_peaks_gen.isocalc_wrapper.isotope_peaks = lambda *args: Centroids([100., 200.], [10., 1.])
    theor_peaks_gen.run()

    db = DB(sm_config['db'])
    rows = db.select(('SELECT db_id, sf_id, adduct, sigma, charge, pts_per_mz, centr_mzs, '
                      'centr_ints, prof_mzs, prof_ints FROM theor_peaks ORDER BY sf_id, adduct'))

    assert len(rows) == 2 + 80
    assert ([r for r in rows if r[2] == '+H'][0] ==
            (0, 9, '+H', 0.01, 1, 10000, [100., 200.], [10., 1.], [], []))
    assert ([r for r in rows if r[2] == '+Na'][0] ==
            (0, 9, '+Na', 0.01, 1, 10000, [100., 200.], [10., 1.], [], []))

    db.close()
Code example #21
def fill_db(test_db, sm_config, ds_config):
    upload_dt = '2000-01-01 00:00:00'
    ds_id = '2000-01-01'
    meta = {"meta": "data"}
    db = DB(sm_config['db'])
    db.insert(
        'INSERT INTO dataset (id, name, input_path, upload_dt, metadata, config, '
        'status, is_public, mol_dbs, adducts) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)',
        rows=[(ds_id, 'ds_name', 'input_path', upload_dt, json.dumps(meta),
               json.dumps(ds_config), 'FINISHED', True, ['HMDB-v4'], ['+H'])])
    db.insert("INSERT INTO job (id, db_id, ds_id) VALUES (%s, %s, %s)",
              rows=[(0, 0, ds_id)])
    db.insert("INSERT INTO sum_formula (id, db_id, sf) VALUES (%s, %s, %s)",
              rows=[(1, 0, 'H2O')])
    db.insert((
        "INSERT INTO iso_image_metrics (job_id, db_id, sf, adduct, iso_image_ids) "
        "VALUES (%s, %s, %s, %s, %s)"),
              rows=[(0, 0, 'H2O', '+H', ['iso_image_1_id', 'iso_image_2_id'])])
    db.close()
Code example #22
    def test_annotate_ds(self, test_db, sm_config, ds_config):
        es_mock = MagicMock(spec=ESExporter)
        db = DB(sm_config['db'])
        try:
            manager = create_daemon_man(sm_config, db=db, es=es_mock)

            ds_id = '2000-01-01'
            ds_name = 'ds_name'
            input_path = 'input_path'
            upload_dt = datetime.now()
            metadata = {}
            ds = create_ds(ds_id=ds_id, ds_name=ds_name, input_path=input_path, upload_dt=upload_dt,
                           metadata=metadata, ds_config=ds_config)

            manager.annotate(ds, search_job_factory=self.SearchJob)

            DS_SEL = 'select name, input_path, upload_dt, metadata, config from dataset where id=%s'
            assert db.select_one(DS_SEL, params=(ds_id,)) == (ds_name, input_path, upload_dt, metadata, ds_config)
        finally:
            db.close()
Code example #23
def test_db(sm_config, request):
    db_config = dict(**sm_config['db'])
    db_config['database'] = 'postgres'

    db = DB(db_config, autocommit=True)
    db.alter('DROP DATABASE IF EXISTS sm_test')
    db.alter('CREATE DATABASE sm_test')
    db.close()

    local('psql -h {} -U {} sm_test < {}'.format(
        sm_config['db']['host'], sm_config['db']['user'],
        Path(proj_root()) / 'scripts/create_schema.sql'))

    def fin():
        db = DB(db_config, autocommit=True)
        try:
            db.alter('DROP DATABASE IF EXISTS sm_test')
        except Exception as e:
            logging.getLogger('engine').warning(
                'Drop sm_test database failed: %s', e)
        finally:
            db.close()

    request.addfinalizer(fin)
Code example #24
File: search_job.py Project: metaspace2020/offsample
class SearchJob(object):
    """ Main class responsible for molecule search. Uses other modules of the engine.

    Args
    ----
    no_clean : bool
        Don't delete interim data files
    """
    def __init__(self, img_store=None, no_clean=False):
        self.no_clean = no_clean
        self._img_store = img_store

        self._job_id = None
        self._sc = None
        self._db = None
        self._ds = None
        self._ds_reader = None
        self._status_queue = None
        self._fdr = None
        self._wd_manager = None
        self._es = None

        self._sm_config = SMConfig.get_conf()

        logger.debug('Using SM config:\n%s', pformat(self._sm_config))

    def _configure_spark(self):
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self._sm_config['spark'].items():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        if 'aws' in self._sm_config:
            sconf.set("spark.hadoop.fs.s3a.access.key",
                      self._sm_config['aws']['aws_access_key_id'])
            sconf.set("spark.hadoop.fs.s3a.secret.key",
                      self._sm_config['aws']['aws_secret_access_key'])
            sconf.set("spark.hadoop.fs.s3a.impl",
                      "org.apache.hadoop.fs.s3a.S3AFileSystem")
            sconf.set(
                "spark.hadoop.fs.s3a.endpoint", "s3.{}.amazonaws.com".format(
                    self._sm_config['aws']['aws_region']))

        self._sc = SparkContext(master=self._sm_config['spark']['master'],
                                conf=sconf,
                                appName='SM engine')

    def _init_db(self):
        logger.info('Connecting to the DB')
        self._db = DB(self._sm_config['db'])

    def store_job_meta(self, mol_db_id):
        """ Store search job metadata in the database """
        logger.info('Storing job metadata')
        rows = [(mol_db_id, self._ds.id, JobStatus.RUNNING,
                 datetime.now().strftime('%Y-%m-%d %H:%M:%S'))]
        self._job_id = self._db.insert_return(JOB_INS, rows=rows)[0]

    def _run_annotation_job(self, mol_db):
        try:
            self.store_job_meta(mol_db.id)
            mol_db.set_job_id(self._job_id)

            logger.info(
                "Running new job ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                self._ds.id, self._ds.name, mol_db.name, mol_db.version)

            target_adducts = self._ds.config['isotope_generation']['adducts']
            self._fdr = FDR(job_id=self._job_id,
                            decoy_sample_size=20,
                            target_adducts=target_adducts,
                            db=self._db)

            isocalc = IsocalcWrapper(self._ds.config['isotope_generation'])
            centroids_gen = IonCentroidsGenerator(sc=self._sc,
                                                  moldb_name=mol_db.name,
                                                  isocalc=isocalc)
            polarity = self._ds.config['isotope_generation']['charge'][
                'polarity']
            all_adducts = list(
                set(self._sm_config['defaults']['adducts'][polarity])
                | set(DECOY_ADDUCTS))
            centroids_gen.generate_if_not_exist(isocalc=isocalc,
                                                sfs=mol_db.sfs,
                                                adducts=all_adducts)
            target_ions = centroids_gen.ions(target_adducts)
            self._fdr.decoy_adducts_selection(target_ions)

            search_alg = MSMBasicSearch(sc=self._sc,
                                        ds=self._ds,
                                        ds_reader=self._ds_reader,
                                        mol_db=mol_db,
                                        centr_gen=centroids_gen,
                                        fdr=self._fdr,
                                        ds_config=self._ds.config)
            ion_metrics_df, ion_iso_images = search_alg.search()

            search_results = SearchResults(mol_db.id, self._job_id,
                                           search_alg.metrics.keys())
            mask = self._ds_reader.get_2d_sample_area_mask()
            img_store_type = self._ds.get_ion_img_storage_type(self._db)
            search_results.store(ion_metrics_df, ion_iso_images, mask,
                                 self._db, self._img_store, img_store_type)
        except Exception as e:
            self._db.alter(
                JOB_UPD_STATUS_FINISH,
                params=(JobStatus.FAILED,
                        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                        self._job_id))
            msg = 'Job failed(ds_id={}, mol_db={}): {}'.format(
                self._ds.id, mol_db, str(e))
            raise JobFailedError(msg) from e
        else:
            self._db.alter(
                JOB_UPD_STATUS_FINISH,
                params=(JobStatus.FINISHED,
                        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                        self._job_id))

    def _remove_annotation_job(self, mol_db):
        logger.info(
            "Removing job results ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
            self._ds.id, self._ds.name, mol_db.name, mol_db.version)
        self._db.alter('DELETE FROM job WHERE ds_id = %s and db_id = %s',
                       params=(self._ds.id, mol_db.id))
        self._es.delete_ds(self._ds.id, mol_db)

    def _moldb_ids(self):
        completed_moldb_ids = {
            db_id
            for (_, db_id) in self._db.select(JOB_ID_MOLDB_ID_SEL,
                                              params=(self._ds.id, ))
        }
        new_moldb_ids = {
            MolecularDB(name=moldb_name).id
            for moldb_name in self._ds.config['databases']
        }
        return completed_moldb_ids, new_moldb_ids

    def _save_data_from_raw_ms_file(self):
        ms_file_type_config = SMConfig.get_ms_file_handler(
            self._wd_manager.local_dir.ms_file_path)
        acq_geometry_factory_module = ms_file_type_config[
            'acq_geometry_factory']
        acq_geometry_factory = getattr(
            import_module(acq_geometry_factory_module['path']),
            acq_geometry_factory_module['name'])

        acq_geometry = acq_geometry_factory(
            self._wd_manager.local_dir.ms_file_path).create()
        self._ds.save_acq_geometry(self._db, acq_geometry)

        self._ds.save_ion_img_storage_type(
            self._db, ms_file_type_config['img_storage_type'])

    def run(self, ds):
        """ Entry point of the engine. Molecule search is completed in several steps:
            * Copying input data to the engine work dir
            * Converting the input mass spec files to plain text format (one line per spectrum)
            * Generating theoretical peaks for all formulas in the molecular database and saving them to the database
            * Molecule search - the most compute-intensive part, run in a distributed manner with Spark
            * Saving the results (isotope images and their quality metrics for each putative molecule) to the database

        Args
        ----
            ds : sm.engine.dataset_manager.Dataset
        """
        try:
            logger.info('*' * 150)
            start = time.time()

            self._init_db()
            self._es = ESExporter(self._db)
            self._ds = ds

            if self._sm_config['rabbitmq']:
                self._status_queue = QueuePublisher(
                    config=self._sm_config['rabbitmq'],
                    qdesc=SM_DS_STATUS,
                    logger=logger)
            else:
                self._status_queue = None

            self._wd_manager = WorkDirManager(ds.id)
            self._configure_spark()

            if not self.no_clean:
                self._wd_manager.clean()

            self._ds_reader = DatasetReader(self._ds.input_path, self._sc,
                                            self._wd_manager)
            self._ds_reader.copy_convert_input_data()

            self._save_data_from_raw_ms_file()
            self._img_store.storage_type = self._ds.get_ion_img_storage_type(
                self._db)

            logger.info('Dataset config:\n%s', pformat(self._ds.config))

            completed_moldb_ids, new_moldb_ids = self._moldb_ids()
            for moldb_id in completed_moldb_ids.symmetric_difference(
                    new_moldb_ids):  # ignore ids present in both sets
                mol_db = MolecularDB(
                    id=moldb_id,
                    db=self._db,
                    iso_gen_config=self._ds.config['isotope_generation'])
                if moldb_id not in new_moldb_ids:
                    self._remove_annotation_job(mol_db)
                elif moldb_id not in completed_moldb_ids:
                    self._run_annotation_job(mol_db)

            logger.info("All done!")
            time_spent = time.time() - start
            logger.info('Time spent: %d mins %d secs',
                        *divmod(int(round(time_spent)), 60))
        finally:
            if self._sc:
                self._sc.stop()
            if self._db:
                self._db.close()
            if self._wd_manager and not self.no_clean:
                self._wd_manager.clean()
            logger.info('*' * 150)
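To make the pipeline described in the run() docstring above concrete, here is a minimal usage sketch mirroring the integration tests earlier in this listing; sm_config, db and ds_id are assumed to come from the surrounding test fixtures, and the dataset row is assumed to already exist in the database:

# Usage sketch only (not part of search_job.py); the fixture names are assumptions.
img_store = ImageStoreServiceWrapper(sm_config['services']['img_service_url'])
job = SearchJob(img_store=img_store)
ds = Dataset.load(db, ds_id)   # load the previously inserted dataset row
job.run(ds)                    # runs one annotation job per molecular DB listed in ds.config['databases']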
Code example #25
File: sm_daemons.py Project: metaspace2020/offsample
class SMAnnotateDaemon(object):
    """ Reads messages from annotation queue and starts annotation jobs
    """
    logger = logging.getLogger('annotate-daemon')

    def __init__(self, manager, annot_qdesc, upd_qdesc, poll_interval=1):
        self._sm_config = SMConfig.get_conf()
        self._stopped = False
        self._annot_queue_consumer = QueueConsumer(
            config=self._sm_config['rabbitmq'],
            qdesc=annot_qdesc,
            callback=self._callback,
            on_success=self._on_success,
            on_failure=self._on_failure,
            logger=self.logger,
            poll_interval=poll_interval)
        self._upd_queue_pub = QueuePublisher(
            config=self._sm_config['rabbitmq'],
            qdesc=upd_qdesc,
            logger=self.logger)

        self._db = DB(self._sm_config['db'])
        self._manager = manager

    def _send_email(self, email, subj, body):
        try:
            cred_dict = dict(
                aws_access_key_id=self._sm_config['aws']['aws_access_key_id'],
                aws_secret_access_key=self._sm_config['aws']
                ['aws_secret_access_key'])
            ses = boto3.client('ses', 'eu-west-1', **cred_dict)
            resp = ses.send_email(Source='*****@*****.**',
                                  Destination={'ToAddresses': [email]},
                                  Message={
                                      'Subject': {
                                          'Data': subj
                                      },
                                      'Body': {
                                          'Text': {
                                              'Data': body
                                          }
                                      }
                                  })
        except Exception as e:
            self.logger.warning(f'Send email exception {e} for {email}')
        else:
            if resp['ResponseMetadata']['HTTPStatusCode'] == 200:
                self.logger.info(
                    f'Email with "{subj}" subject was sent to {email}')
            else:
                self.logger.warning(f'SES failed to send email to {email}')

    def _on_success(self, msg):
        ds = Dataset.load(self._db, msg['ds_id'])
        ds.set_status(self._db, self._manager.es, self._manager.status_queue,
                      DatasetStatus.FINISHED)

        self.logger.info(f" SM annotate daemon: success")

        ds_name, _ = self._manager.fetch_ds_metadata(msg['ds_id'])
        msg['web_app_link'] = self._manager.create_web_app_link(msg)
        self._manager.post_to_slack(
            'dart', ' [v] Annotation succeeded: {}'.format(json.dumps(msg)))

        if msg.get('email'):
            email_body = (
                'Dear METASPACE user,\n\n'
                'Thank you for uploading the "{}" dataset to the METASPACE annotation service. '
                'We are pleased to inform you that the dataset has been processed and is available at {}.\n\n'
                'Best regards,\n'
                'METASPACE Team').format(ds_name, msg['web_app_link'])
            self._send_email(msg['email'],
                             'METASPACE service notification (SUCCESS)',
                             email_body)

    def _on_failure(self, msg):
        ds = Dataset.load(self._db, msg['ds_id'])
        ds.set_status(self._db, self._manager.es, self._manager.status_queue,
                      DatasetStatus.FAILED)

        self.logger.error(f" SM annotate daemon: failure", exc_info=True)

        ds_name, _ = self._manager.fetch_ds_metadata(msg['ds_id'])
        msg['web_app_link'] = self._manager.create_web_app_link(msg)
        self._manager.post_to_slack(
            'hankey', ' [x] Annotation failed: {}'.format(json.dumps(msg)))

        if msg.get('email'):
            email_body = (
                'Dear METASPACE user,\n\n'
                'We are sorry to inform you that there was a problem during processing of the "{}" dataset '
                'and it could not be annotated. '
                'If this is unexpected, please do not hesitate to contact us for support at [email protected]\n\n'
                'Best regards,\n'
                'METASPACE Team').format(ds_name)
            self._send_email(msg['email'],
                             'METASPACE service notification (FAILED)',
                             email_body)

    def _callback(self, msg):
        ds = Dataset.load(self._db, msg['ds_id'])
        ds.set_status(self._db, self._manager.es, self._manager.status_queue,
                      DatasetStatus.ANNOTATING)

        self.logger.info(f" SM annotate daemon received a message: {msg}")
        self._manager.post_to_slack(
            'new', " [v] New annotation message: {}".format(json.dumps(msg)))

        self._manager.annotate(ds=ds,
                               search_job_factory=SearchJob,
                               del_first=msg.get('del_first', False))

        upd_msg = {
            'ds_id': msg['ds_id'],
            'ds_name': msg['ds_name'],
            'action': 'update'
        }
        self._upd_queue_pub.publish(msg=upd_msg, priority=2)

    def start(self):
        self._stopped = False
        self._annot_queue_consumer.start()

    def stop(self):
        if not self._stopped:
            self._annot_queue_consumer.stop()
            self._annot_queue_consumer.join()
            self._stopped = True
        if self._db:
            self._db.close()
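A rough sketch of how this daemon could be wired up; `manager` is assumed to be the engine's dataset manager, and SM_ANNOTATE / SM_UPDATE are placeholder names for the queue descriptors used by the deployment:

# Hypothetical wiring sketch; SM_ANNOTATE and SM_UPDATE are placeholder queue descriptors.
annotate_daemon = SMAnnotateDaemon(manager=manager,
                                   annot_qdesc=SM_ANNOTATE,
                                   upd_qdesc=SM_UPDATE,
                                   poll_interval=1)
annotate_daemon.start()   # begin consuming the annotation queue
# ... later, on shutdown:
annotate_daemon.stop()    # stops and joins the consumer, then closes the DB connection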
Code example #26
class SearchJob(object):
    """ Main class responsible for molecule search. Uses other modules of the engine.

    Args
    ----
    no_clean : bool
        Don't delete interim data files
    """
    def __init__(self, img_store=None, no_clean=False):
        self.no_clean = no_clean
        self._img_store = img_store

        self._job_id = None
        self._sc = None
        self._db = None
        self._ds = None
        self._ds_reader = None
        self._status_queue = None
        self._fdr = None
        self._wd_manager = None
        self._es = None

        self._sm_config = SMConfig.get_conf()

        logger.debug('Using SM config:\n%s', pformat(self._sm_config))

    def _configure_spark(self):
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self._sm_config['spark'].items():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        if 'aws' in self._sm_config:
            sconf.set("spark.hadoop.fs.s3a.access.key", self._sm_config['aws']['aws_access_key_id'])
            sconf.set("spark.hadoop.fs.s3a.secret.key", self._sm_config['aws']['aws_secret_access_key'])
            sconf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            sconf.set("spark.hadoop.fs.s3a.endpoint", "s3.{}.amazonaws.com".format(self._sm_config['aws']['aws_region']))

        self._sc = SparkContext(master=self._sm_config['spark']['master'], conf=sconf, appName='SM engine')

    def _init_db(self):
        logger.info('Connecting to the DB')
        self._db = DB(self._sm_config['db'])

    def store_job_meta(self, mol_db_id):
        """ Store search job metadata in the database """
        logger.info('Storing job metadata')
        rows = [(mol_db_id, self._ds.id, 'STARTED', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))]
        self._job_id = self._db.insert_return(JOB_INS, rows=rows)[0]

    def _run_annotation_job(self, mol_db):
        try:
            self.store_job_meta(mol_db.id)
            mol_db.set_job_id(self._job_id)

            logger.info("Running new job ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                        self._ds.id, self._ds.name, mol_db.name, mol_db.version)

            target_adducts = self._ds.config['isotope_generation']['adducts']
            self._fdr = FDR(job_id=self._job_id,
                            decoy_sample_size=20,
                            target_adducts=target_adducts,
                            db=self._db)

            isocalc = IsocalcWrapper(self._ds.config['isotope_generation'])
            centroids_gen = IonCentroidsGenerator(sc=self._sc, moldb_name=mol_db.name, isocalc=isocalc)
            polarity = self._ds.config['isotope_generation']['charge']['polarity']
            all_adducts = list(set(self._sm_config['defaults']['adducts'][polarity]) | set(DECOY_ADDUCTS))
            centroids_gen.generate_if_not_exist(isocalc=isocalc,
                                                sfs=mol_db.sfs,
                                                adducts=all_adducts)
            target_ions = centroids_gen.ions(target_adducts)
            self._fdr.decoy_adducts_selection(target_ions)

            search_alg = MSMBasicSearch(sc=self._sc, ds=self._ds, ds_reader=self._ds_reader,
                                        mol_db=mol_db, centr_gen=centroids_gen,
                                        fdr=self._fdr, ds_config=self._ds.config)
            ion_metrics_df, ion_iso_images = search_alg.search()

            search_results = SearchResults(mol_db.id, self._job_id, search_alg.metrics.keys())
            mask = self._ds_reader.get_2d_sample_area_mask()
            img_store_type = self._ds.get_ion_img_storage_type(self._db)
            search_results.store(ion_metrics_df, ion_iso_images, mask, self._db, self._img_store, img_store_type)
        except Exception as e:
            self._db.alter(JOB_UPD, params=('FAILED', datetime.now().strftime('%Y-%m-%d %H:%M:%S'), self._job_id))
            msg = 'Job failed(ds_id={}, mol_db={}): {}'.format(self._ds.id, mol_db, str(e))
            raise JobFailedError(msg) from e
        else:
            self._export_search_results_to_es(mol_db, isocalc)

    def _export_search_results_to_es(self, mol_db, isocalc):
        try:
            self._es.index_ds(self._ds.id, mol_db, isocalc)
        except Exception as e:
            self._db.alter(JOB_UPD, params=('FAILED', datetime.now().strftime('%Y-%m-%d %H:%M:%S'), self._job_id))
            msg = 'Export to ES failed(ds_id={}, mol_db={}): {}'.format(self._ds.id, mol_db, str(e))
            raise ESExportFailedError(msg) from e
        else:
            self._db.alter(JOB_UPD, params=('FINISHED', datetime.now().strftime('%Y-%m-%d %H:%M:%S'), self._job_id))

    def _remove_annotation_job(self, mol_db):
        logger.info("Removing job results ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                    self._ds.id, self._ds.name, mol_db.name, mol_db.version)
        self._db.alter('DELETE FROM job WHERE ds_id = %s and db_id = %s', params=(self._ds.id, mol_db.id))
        self._es.delete_ds(self._ds.id, mol_db)

    def _moldb_ids(self):
        moldb_service = MolDBServiceWrapper(self._sm_config['services']['mol_db'])
        completed_moldb_ids = {moldb_service.find_db_by_id(db_id)['id']
                               for (_, db_id) in self._db.select(JOB_ID_MOLDB_ID_SEL, params=(self._ds.id,))}
        new_moldb_ids = {moldb_service.find_db_by_name_version(moldb_name)[0]['id']
                         for moldb_name in self._ds.config['databases']}
        return completed_moldb_ids, new_moldb_ids

    def _save_data_from_raw_ms_file(self):
        ms_file_type_config = SMConfig.get_ms_file_handler(self._wd_manager.local_dir.ms_file_path)
        acq_geometry_factory_module = ms_file_type_config['acq_geometry_factory']
        acq_geometry_factory = getattr(import_module(acq_geometry_factory_module['path']),
                                                acq_geometry_factory_module['name'])

        acq_geometry = acq_geometry_factory(self._wd_manager.local_dir.ms_file_path).create()
        self._ds.save_acq_geometry(self._db, acq_geometry)

        self._ds.save_ion_img_storage_type(self._db, ms_file_type_config['img_storage_type'])

    def run(self, ds):
        """ Entry point of the engine. Molecule search is completed in several steps:
            * Copying input data to the engine work dir
            * Converting the input mass spec files to plain text format (one line per spectrum)
            * Generating theoretical peaks for all formulas in the molecular database and saving them to the database
            * Molecule search - the most compute-intensive part, run in a distributed manner with Spark
            * Saving the results (isotope images and their quality metrics for each putative molecule) to the database

        Args
        ----
            ds : sm.engine.dataset_manager.Dataset
        """
        try:
            start = time.time()

            self._init_db()
            self._es = ESExporter(self._db)
            self._ds = ds

            if self._sm_config['rabbitmq']:
                self._status_queue = QueuePublisher(config=self._sm_config['rabbitmq'],
                                                    qdesc=SM_DS_STATUS,
                                                    logger=logger)
            else:
                self._status_queue = None
            ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.STARTED)

            self._wd_manager = WorkDirManager(ds.id)
            self._configure_spark()

            if not self.no_clean:
                self._wd_manager.clean()

            self._ds_reader = DatasetReader(self._ds.input_path, self._sc, self._wd_manager)
            self._ds_reader.copy_convert_input_data()

            self._save_data_from_raw_ms_file()
            self._img_store.storage_type = self._ds.get_ion_img_storage_type(self._db)

            ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.STARTED)

            logger.info('Dataset config:\n%s', pformat(self._ds.config))

            completed_moldb_ids, new_moldb_ids = self._moldb_ids()
            for moldb_id in completed_moldb_ids.symmetric_difference(new_moldb_ids):  # ignore ids present in both sets
                mol_db = MolecularDB(id=moldb_id, db=self._db,
                                     iso_gen_config=self._ds.config['isotope_generation'])
                if moldb_id not in new_moldb_ids:
                    self._remove_annotation_job(mol_db)
                elif moldb_id not in completed_moldb_ids:
                    self._run_annotation_job(mol_db)

            ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.FINISHED)

            logger.info("All done!")
            time_spent = time.time() - start
            logger.info('Time spent: %d mins %d secs', *divmod(int(round(time_spent)), 60))
        except Exception as e:
            if self._ds:
                ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.FAILED)
            logger.error(e, exc_info=True)
            raise
        finally:
            if self._sc:
                self._sc.stop()
            if self._db:
                self._db.close()
            if self._wd_manager and not self.no_clean:
                self._wd_manager.clean()
            logger.info('*' * 150)
Code example #27
def test_sm_daemons(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                    post_images_to_annot_service_mock,
                    MolDBServiceWrapperMock,
                    sm_config, test_db, es_dsl_search, clean_isotope_storage):
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)

    get_compute_img_metrics_mock.return_value = lambda *args: (0.9, 0.9, 0.9, [100.], [0], [10.])
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {
        'iso_image_ids': ['iso_image_1', None, None, None]
    }
    post_images_to_annot_service_mock.return_value = {
        35: url_dict,
        44: url_dict
    }

    db = DB(sm_config['db'])
    es = ESExporter(db)
    annotate_daemon = None
    update_daemon = None

    try:
        ds_config_str = open(ds_config_path).read()
        upload_dt = datetime.now()
        ds_id = '2000-01-01_00h00m'
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{"Data_Type": "Imaging MS"}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])

        ds = Dataset.load(db, ds_id)
        queue_pub.publish({'ds_id': ds.id, 'ds_name': ds.name, 'action': 'annotate'})

        run_daemons(db, es)

        # dataset table asserts
        rows = db.select('SELECT id, name, input_path, upload_dt, status from dataset')
        input_path = join(dirname(__file__), 'data', test_ds_name)
        assert len(rows) == 1
        assert rows[0] == (ds_id, test_ds_name, input_path, upload_dt, DatasetStatus.FINISHED)

        # ms acquisition geometry asserts
        rows = db.select('SELECT acq_geometry from dataset')
        assert len(rows) == 1
        assert rows[0][0] == ds.get_acq_geometry(db)
        assert rows[0][0] == {
            ACQ_GEOMETRY_KEYS.LENGTH_UNIT: 'nm',
            ACQ_GEOMETRY_KEYS.AcqGridSection.section_name: {
                ACQ_GEOMETRY_KEYS.AcqGridSection.REGULAR_GRID: True,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_X : 3,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_Y : 3,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_X : 100,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_Y : 100
            },
            ACQ_GEOMETRY_KEYS.PixelSizeSection.section_name: {
                ACQ_GEOMETRY_KEYS.PixelSizeSection.REGULAR_SIZE: True,
                ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_X : 100,
                ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_Y : 100
            }
        }

        # job table asserts
        rows = db.select('SELECT db_id, ds_id, status, start, finish from job')
        assert len(rows) == 1
        db_id, ds_id, status, start, finish = rows[0]
        assert (db_id, ds_id, status) == (0, '2000-01-01_00h00m', JobStatus.FINISHED)
        assert start < finish

        # image metrics asserts
        rows = db.select(('SELECT db_id, sf, adduct, stats, iso_image_ids '
                          'FROM iso_image_metrics '
                          'ORDER BY sf, adduct'))

        assert rows[0] == (0, 'C12H24O', '+K', {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9,
                                                'total_iso_ints': [100.], 'min_iso_ints': [0], 'max_iso_ints': [10.]},
                           ['iso_image_1', None, None, None])
        assert rows[1] == (0, 'C12H24O', '+Na', {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9,
                                                 'total_iso_ints': [100.], 'min_iso_ints': [0], 'max_iso_ints': [10.]},
                           ['iso_image_1', None, None, None])

        time.sleep(1)  # Waiting for ES
        # ES asserts
        ds_docs = es_dsl_search.query('term', _type='dataset').execute().to_dict()['hits']['hits']
        assert 1 == len(ds_docs)
        ann_docs = es_dsl_search.query('term', _type='annotation').execute().to_dict()['hits']['hits']
        assert len(ann_docs) == len(rows)
        for doc in ann_docs:
            assert doc['_id'].startswith(ds_id)

    finally:
        db.close()
        if annotate_daemon:
            annotate_daemon.stop()
        if update_daemon:
            update_daemon.stop()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
Code example #28
def test_search_job_imzml_example(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                                  post_images_to_annot_service_mock, MolDBServiceWrapperMock, MolDBServiceWrapperMock2,
                                  sm_config, create_fill_sm_database, es_dsl_search, clean_isotope_storage):
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock2)

    get_compute_img_metrics_mock.return_value = lambda *args: (0.9, 0.9, 0.9, [100.], [0], [10.])
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {
        'iso_image_ids': ['iso_image_1', None, None, None]
    }
    post_images_to_annot_service_mock.return_value = {
        35: url_dict,
        44: url_dict
    }

    db = DB(sm_config['db'])

    try:
        ds_config_str = open(ds_config_path).read()
        upload_dt = datetime.now()
        ds_id = '2000-01-01_00h00m'
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])

        img_store = ImageStoreServiceWrapper(sm_config['services']['img_service_url'])
        job = SearchJob(img_store=img_store)
        job._sm_config['rabbitmq'] = {}  # avoid talking to RabbitMQ during the test
        ds = Dataset.load(db, ds_id)
        job.run(ds)

        # dataset table asserts
        rows = db.select('SELECT id, name, input_path, upload_dt, status from dataset')
        input_path = join(dirname(__file__), 'data', test_ds_name)
        assert len(rows) == 1
        assert rows[0] == (ds_id, test_ds_name, input_path, upload_dt, DatasetStatus.FINISHED)

        # ms acquisition geometry asserts
        rows = db.select('SELECT acq_geometry from dataset')
        assert len(rows) == 1
        assert rows[0][0] == ds.get_acq_geometry(db)
        assert rows[0][0] == {
            ACQ_GEOMETRY_KEYS.LENGTH_UNIT: 'nm',
            ACQ_GEOMETRY_KEYS.AcqGridSection.section_name: {
                ACQ_GEOMETRY_KEYS.AcqGridSection.REGULAR_GRID: True,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_X : 3,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_Y : 3,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_X : 100,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_Y : 100
            },
            ACQ_GEOMETRY_KEYS.PixelSizeSection.section_name: {
                ACQ_GEOMETRY_KEYS.PixelSizeSection.REGULAR_SIZE: True,
                ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_X : 100,
                ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_Y : 100
            }
        }

        # job table asserts
        rows = db.select('SELECT db_id, ds_id, status, start, finish from job')
        assert len(rows) == 1
        db_id, ds_id, status, start, finish = rows[0]
        assert (db_id, ds_id, status) == (0, '2000-01-01_00h00m', 'FINISHED')
        assert start < finish

        # image metrics asserts
        rows = db.select(('SELECT db_id, sf, adduct, stats, iso_image_ids '
                          'FROM iso_image_metrics '
                          'ORDER BY sf, adduct'))

        assert rows[0] == (0, 'C12H24O', '+K', {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9,
                                                'total_iso_ints': [100.], 'min_iso_ints': [0], 'max_iso_ints': [10.]},
                           ['iso_image_1', None, None, None])
        assert rows[1] == (0, 'C12H24O', '+Na', {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9,
                                                 'total_iso_ints': [100.], 'min_iso_ints': [0], 'max_iso_ints': [10.]},
                           ['iso_image_1', None, None, None])

        time.sleep(1)  # Waiting for ES
        # ES asserts
        ds_docs = es_dsl_search.query('term', _type='dataset').execute().to_dict()['hits']['hits']
        assert 1 == len(ds_docs)
        ann_docs = es_dsl_search.query('term', _type='annotation').execute().to_dict()['hits']['hits']
        assert len(ann_docs) == len(rows)
        for doc in ann_docs:
            assert doc['_id'].startswith(ds_id)

    finally:
        db.close()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
Code example #29
File: util.py Project: frulo/SM_distributed
def fin():
    db_config = dict(database='postgres', user='******', host='localhost', password='******')
    db = DB(db_config, autocommit=True)
    db.alter('DROP DATABASE IF EXISTS sm_test')
    db.close()
Code example #30
File: sm_daemons.py Project: metaspace2020/offsample
class SMUpdateDaemon(object):
    """ Reads messages from update queue and does updates/deletes
    """
    logger = logging.getLogger('update-daemon')

    def __init__(self, manager, update_qdesc, poll_interval=1):
        self._manager = manager
        self._sm_config = SMConfig.get_conf()
        self._db = DB(self._sm_config['db'])
        self._update_queue_cons = QueueConsumer(
            config=self._sm_config['rabbitmq'],
            qdesc=update_qdesc,
            callback=self._callback,
            on_success=self._on_success,
            on_failure=self._on_failure,
            logger=self.logger,
            poll_interval=poll_interval)
        self._status_queue_pub = QueuePublisher(
            config=self._sm_config['rabbitmq'],
            qdesc=SM_DS_STATUS,
            logger=self.logger)
        self._stopped = False

    def _post_to_slack(self, msg):
        if msg['action'] == 'update':
            msg['web_app_link'] = self._manager.create_web_app_link(msg)
            self._manager.post_to_slack(
                'dart', f' [v] Update succeeded: {json.dumps(msg)}')
        elif msg['action'] == 'delete':
            self._manager.post_to_slack(
                'dart', f' [v] Delete succeeded: {json.dumps(msg)}')

    def _on_success(self, msg):
        ds = Dataset.load(self._db, msg['ds_id'])
        ds.set_status(self._db, self._manager.es, self._manager.status_queue,
                      DatasetStatus.FINISHED)

        self.logger.info(f" SM update daemon: success")
        self._post_to_slack(msg)

    def _on_failure(self, msg):
        ds = Dataset.load(self._db, msg['ds_id'])
        ds.set_status(self._db, self._manager.es, self._manager.status_queue,
                      DatasetStatus.FAILED)

        self.logger.error(f" SM update daemon: failure", exc_info=True)
        self._post_to_slack(msg)

    def _callback(self, msg):
        ds = Dataset.load(self._db, msg['ds_id'])
        ds.set_status(self._db, self._manager.es, self._manager.status_queue,
                      DatasetStatus.INDEXING)

        self.logger.info(f' SM update daemon received a message: {msg}')
        self._manager.post_to_slack(
            'new', f" [v] New {msg['action']} message: {json.dumps(msg)}")

        if msg['action'] == 'update':
            self._manager.index(ds=ds)
        elif msg['action'] == 'delete':
            self._manager.delete(ds=ds)
        else:
            raise Exception(f"Wrong action: {msg['action']}")

    def start(self):
        self._stopped = False
        self._update_queue_cons.start()

    def stop(self):
        if not self._stopped:
            self._update_queue_cons.stop()
            self._update_queue_cons.join()
            self._stopped = True
        if self._db:
            self._db.close()
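
A minimal usage sketch for the daemon above; the manager object and the update_qdesc queue descriptor are assumed to be constructed elsewhere and are placeholders here:

# Hypothetical wiring: the daemon consumes messages such as
# {'ds_id': '2000-01-01_00h00m', 'action': 'update'} until stop() is called.
daemon = SMUpdateDaemon(manager=manager, update_qdesc=update_qdesc, poll_interval=1)
try:
    daemon.start()
    # ... keep the process alive while messages are being handled ...
finally:
    daemon.stop()  # stops the queue consumer and closes the DB connection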
Code example #31
class SearchJob(object):
    """ Main class responsible for molecule search. Uses other modules of the engine.

    Args
    ----------
    client_email : string
        Email address of the client, stored with the dataset metadata
    ds_name : string
        A dataset short name
    """
    def __init__(self, client_email, ds_name):
        self.sm_config = SMConfig.get_conf()
        self.client_email = client_email
        self.ds_name = ds_name
        self.ds_id = None
        self.sf_db_id = None
        self.job_id = None
        self.sc = None
        self.db = None
        self.ds = None
        self.fdr = None
        self.formulas = None
        self.ds_config = None
        self.wd_manager = None

    def _read_ds_config(self):
        with open(self.wd_manager.ds_config_path) as f:
            self.ds_config = json.load(f)

    def _configure_spark(self):
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self.sm_config['spark'].items():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        if 'aws' in self.sm_config:
            sconf.set("spark.hadoop.fs.s3a.access.key",
                      self.sm_config['aws']['aws_access_key_id'])
            sconf.set("spark.hadoop.fs.s3a.secret.key",
                      self.sm_config['aws']['aws_secret_access_key'])
            sconf.set("spark.hadoop.fs.s3a.impl",
                      "org.apache.hadoop.fs.s3a.S3AFileSystem")

        # sconf.set("spark.python.profile", "true")
        self.sc = SparkContext(master=self.sm_config['spark']['master'],
                               conf=sconf,
                               appName='SM engine')
        if not self.sm_config['spark']['master'].startswith('local'):
            self.sc.addPyFile(join(local_path(proj_root()), 'sm.zip'))

    def _init_db(self):
        logger.info('Connecting to the DB')
        self.db = DB(self.sm_config['db'])
        self.sf_db_id = self.db.select_one(
            DB_ID_SEL, self.ds_config['database']['name'])[0]

    def store_job_meta(self):
        """ Store search job metadata in the database """
        logger.info('Storing job metadata')
        self.ds_id = int(self.db.select_one(DS_ID_SEL, self.ds_name)[0])
        self.job_id = self.ds_id
        self.db.alter(DEL_JOB_SQL, self.job_id)
        rows = [(self.job_id, self.sf_db_id, self.ds_id,
                 datetime.now().strftime('%Y-%m-%d %H:%M:%S'))]
        self.db.insert(JOB_INS, rows)

        rows = [(self.job_id, adduct)
                for adduct in self.ds_config['isotope_generation']['adducts']]
        self.db.insert(ADDUCT_INS, rows)

    def run(self, input_path, ds_config_path, clean=False):
        """ Entry point of the engine. Molecule search is completed in several steps:
         * Copying input data to the engine work dir
         * Conversion input data (imzML+ibd) to plain text format. One line - one spectrum data
         * Generation and saving to the database theoretical peaks for all formulas from the molecule database
         * Molecules search. The most compute intensive part. Spark is used to run it in distributed manner.
         * Saving results (isotope images and their metrics of quality for each putative molecule) to the database

        Args
        -------
        input_path : string
            Path to the dataset folder with .imzML and .ibd files
        ds_config_path: string
            Path to the dataset config file
        clean : bool
            Clean all interim data files before starting molecule search
        """
        try:
            self.wd_manager = WorkDirManager(self.ds_name)
            if clean:
                self.wd_manager.clean()

            self.wd_manager.copy_input_data(input_path, ds_config_path)

            self._read_ds_config()
            logger.info('Dataset config:\n%s', pformat(self.ds_config))

            self._configure_spark()
            self._init_db()

            if not self.wd_manager.exists(self.wd_manager.txt_path):
                imzml_converter = ImzmlTxtConverter(
                    self.ds_name, self.wd_manager.local_dir.imzml_path,
                    self.wd_manager.local_dir.txt_path,
                    self.wd_manager.local_dir.coord_path)
                imzml_converter.convert()

                if not self.wd_manager.local_fs_only:
                    self.wd_manager.upload_to_remote()

            self.ds = Dataset(self.sc, self.ds_name, self.client_email,
                              input_path, self.ds_config, self.wd_manager,
                              self.db)
            self.ds.save_ds_meta()

            self.store_job_meta()

            theor_peaks_gen = TheorPeaksGenerator(self.sc, self.sm_config,
                                                  self.ds_config)
            theor_peaks_gen.run()

            target_adducts = self.ds_config['isotope_generation']['adducts']
            self.fdr = FDR(self.job_id,
                           self.sf_db_id,
                           decoy_sample_size=20,
                           target_adducts=target_adducts,
                           db=self.db)
            self.fdr.decoy_adduct_selection()
            self.formulas = FormulasSegm(self.job_id, self.sf_db_id,
                                         self.ds_config, self.db)

            # search_alg = MSMBasicSearch(self.sc, self.ds, self.formulas, self.fdr, self.ds_config)
            search_alg = MSMExtraFeats(self.sc, self.ds, self.formulas,
                                       self.fdr, self.ds_config)
            sf_metrics_df, sf_iso_images = search_alg.search()

            search_results = SearchResults(
                self.sf_db_id, self.ds_id, self.job_id, self.ds_name,
                self.formulas.get_sf_adduct_peaksn(), self.db, self.sm_config,
                self.ds_config)
            search_results.sf_metrics_df = sf_metrics_df
            search_results.sf_iso_images = sf_iso_images
            search_results.metrics = search_alg.metrics
            search_results.nrows, search_results.ncols = self.ds.get_dims()
            search_results.store()

            es = ESExporter(self.sm_config)
            es.index_ds(self.db, self.ds_name,
                        self.ds_config['database']['name'])

        except Exception:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            logger.error('\n'.join(
                traceback.format_exception(exc_type, exc_value,
                                           exc_traceback)))
        finally:
            if self.sc:
                # self.sc.show_profiles()
                self.sc.stop()
            if self.db:
                self.db.close()
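
A minimal invocation sketch for this (older, name-based) SearchJob API; the email, dataset name and paths below are placeholders, not taken from the example:

# Hypothetical invocation: run a full search for a dataset folder containing
# the .imzML/.ibd files and a dataset config file.
job = SearchJob(client_email='user@example.com', ds_name='test_ds')
job.run(input_path='/data/test_ds',
        ds_config_path='/data/test_ds/config.json',
        clean=True)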