def test_search_job_imzml_example_es_export_fails(get_compute_img_metrics_mock, filter_sf_metrics_mock, post_images_to_annot_service_mock, MolDBServiceWrapperMock, MolDBServiceWrapperMock2, sm_config, create_fill_sm_database, es_dsl_search, clean_isotope_storage): init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock) init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock2) get_compute_img_metrics_mock.return_value = lambda *args: (0.9, 0.9, 0.9, [100.], [0], [10.]) filter_sf_metrics_mock.side_effect = lambda x: x url_dict = { 'iso_image_ids': ['iso_image_1', None, None, None] } post_images_to_annot_service_mock.return_value = { 35: url_dict, 44: url_dict } db = DB(sm_config['db']) def throw_exception_function(*args): raise Exception('Test') try: ds_id = '2000-01-01_00h00m' upload_dt = datetime.now() ds_config_str = open(ds_config_path).read() db.insert(Dataset.DS_INSERT, [{ 'id': ds_id, 'name': test_ds_name, 'input_path': input_dir_path, 'upload_dt': upload_dt, 'metadata': '{}', 'config': ds_config_str, 'status': DatasetStatus.QUEUED, 'is_public': True, 'mol_dbs': ['HMDB-v4'], 'adducts': ['+H', '+Na', '+K'], 'ion_img_storage': 'fs' }]) with patch('sm.engine.search_job.ESExporter.index_ds') as index_ds_mock: index_ds_mock.side_effect = throw_exception_function img_store = ImageStoreServiceWrapper(sm_config['services']['img_service_url']) job = SearchJob(img_store=img_store) ds = Dataset.load(db, ds_id) job.run(ds) except ESExportFailedError as e: assert e # dataset table asserts row = db.select_one('SELECT status from dataset') assert row[0] == 'FAILED' else: raise AssertionError('ESExportFailedError should be raised') finally: db.close() with warn_only(): local('rm -rf {}'.format(data_dir_path))
def test_save_ds_meta_ds_doesnt_exist(spark_context, create_test_db, drop_test_db, sm_config, ds_config): work_dir_man_mock = MagicMock(WorkDirManager) work_dir_man_mock.ds_coord_path = '/ds_path' work_dir_man_mock.txt_path = '/txt_path' SMConfig._config_dict = sm_config with patch('sm.engine.tests.util.SparkContext.textFile') as m: m.return_value = spark_context.parallelize([ '0,1,1\n', '1,100,200\n']) dataset = Dataset(spark_context, 'ds_name', '', ds_config, work_dir_man_mock, DB(sm_config['db'])) dataset.save_ds_meta() db = DB(sm_config['db']) ds_row = db.select_one('SELECT name, file_path, img_bounds, config from dataset') assert ds_row == ('ds_name', '/txt_path', {u'x': {u'min': 1, u'max': 100}, u'y': {u'min': 1, u'max': 200}}, ds_config) coord_row = db.select_one('SELECT xs, ys from coordinates') assert coord_row == ([1, 100], [1, 200]) db.close()
def test_save_ds_meta_ds_doesnt_exist(spark_context, create_test_db, drop_test_db, sm_config, ds_config): work_dir_man_mock = MagicMock(WorkDirManager) work_dir_man_mock.ds_coord_path = '/ds_path' work_dir_man_mock.txt_path = '/txt_path' SMConfig._config_dict = sm_config with patch('sm.engine.tests.util.SparkContext.textFile') as m: m.return_value = spark_context.parallelize(['0,1,1\n', '1,100,200\n']) dataset = Dataset(spark_context, 'ds_name', '', 'input_path', ds_config, work_dir_man_mock, DB(sm_config['db'])) dataset.save_ds_meta() db = DB(sm_config['db']) ds_row = db.select_one( 'SELECT name, file_path, img_bounds, config from dataset') assert ds_row == ('ds_name', 'input_path', { u'x': { u'min': 1, u'max': 100 }, u'y': { u'min': 1, u'max': 200 } }, ds_config) coord_row = db.select_one('SELECT xs, ys from coordinates') assert coord_row == ([1, 100], [1, 200]) db.close()
def test_save_sf_iso_images_correct_db_call(spark_context, create_fill_sm_database, sm_config, ds_config):
    sf_iso_imgs = spark_context.parallelize([((1, '+H'), [csr_matrix([[100, 0, 0], [0, 0, 0]]),
                                                          csr_matrix([[0, 0, 0], [0, 0, 10]])])])
    sf_adduct_peaksn = [(1, '+H', 2)]
    # assumed: a DB stand-in; in the original test module this mock is defined elsewhere
    db_mock = MagicMock(spec=DB)
    res = SearchResults(0, 0, 0, 'ds_name', sf_adduct_peaksn, db_mock, sm_config, ds_config)
    res.sf_iso_images = sf_iso_imgs
    res.nrows, res.ncols = 2, 3
    res.store_sf_iso_images()

    correct_rows = [(0, 0, 1, '+H', 0, [0], [100], 0, 100),
                    (0, 0, 1, '+H', 1, [5], [10], 0, 10)]

    db = DB(sm_config['db'])
    try:
        rows = db.select('SELECT job_id, db_id, sf_id, adduct, peak, pixel_inds, intensities, min_int, max_int '
                         'FROM iso_image '
                         'ORDER BY sf_id, adduct')
        assert correct_rows == rows
    finally:
        db.close()
def fin():
    db_config = dict(database='postgres', user='******', host='localhost', password='******')
    db = DB(db_config, autocommit=True)
    db.alter('DROP DATABASE IF EXISTS sm_test')
    db.close()
def store_iso_img_rows(row_it):
    db = DB(db_config)
    try:
        rows = list(row_it)
        if rows:
            db.insert(SF_ISO_IMGS_INS, rows)
    finally:
        db.close()
def create_test_db():
    db_config = dict(database='postgres', user='******', host='localhost')
    db = DB(db_config, autocommit=True)
    db.alter('DROP DATABASE IF EXISTS sm_test')
    db.alter('CREATE DATABASE sm_test')
    db.close()

    local('psql -h localhost -U sm sm_test < {}'.format(join(proj_root(), 'scripts/create_schema.sql')))
def test_sm_daemons_annot_fails(get_compute_img_metrics_mock, filter_sf_metrics_mock, post_images_to_annot_service_mock, MolDBServiceWrapperMock, sm_config, test_db, es_dsl_search, clean_isotope_storage): init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock) def throw_exception_function(*args): raise Exception('Test') get_compute_img_metrics_mock.return_value = throw_exception_function filter_sf_metrics_mock.side_effect = lambda x: x url_dict = { 'iso_image_ids': ['iso_image_1', None, None, None] } post_images_to_annot_service_mock.return_value = { 35: url_dict, 44: url_dict } db = DB(sm_config['db']) es = ESExporter(db) annotate_daemon = None try: ds_id = '2000-01-01_00h00m' upload_dt = datetime.now() ds_config_str = open(ds_config_path).read() db.insert(Dataset.DS_INSERT, [{ 'id': ds_id, 'name': test_ds_name, 'input_path': input_dir_path, 'upload_dt': upload_dt, 'metadata': '{}', 'config': ds_config_str, 'status': DatasetStatus.QUEUED, 'is_public': True, 'mol_dbs': ['HMDB-v4'], 'adducts': ['+H', '+Na', '+K'], 'ion_img_storage': 'fs' }]) ds = Dataset.load(db, ds_id) queue_pub.publish({'ds_id': ds.id, 'ds_name': ds.name, 'action': 'annotate'}) run_daemons(db, es) # dataset and job tables asserts row = db.select_one('SELECT status from dataset') assert row[0] == 'FAILED' row = db.select_one('SELECT status from job') assert row[0] == 'FAILED' finally: db.close() if annotate_daemon: annotate_daemon.stop() with warn_only(): local('rm -rf {}'.format(data_dir_path))
def create_test_db():
    db_config = dict(database='postgres', user='******', host='localhost')
    db = DB(db_config, autocommit=True)
    db.alter('DROP DATABASE IF EXISTS sm_test')
    db.alter('CREATE DATABASE sm_test')
    db.close()

    local('psql -h localhost -U sm sm_test < {}'.format(join(proj_root(), 'scripts/create_schema.sql')))
def fin():
    db = DB(db_config, autocommit=True)
    try:
        db.alter('DROP DATABASE IF EXISTS sm_test')
    except Exception as e:
        logging.getLogger('engine').warning('Drop sm_test database failed: %s', e)
    finally:
        db.close()
def fill_test_db(create_test_db, drop_test_db):
    db_config = dict(database='sm_test', user='******', host='localhost', password='******')
    db = DB(db_config)
    try:
        db.alter('TRUNCATE dataset CASCADE')
        db.insert("INSERT INTO dataset VALUES (%s, %s, %s, %s, %s)",
                  [(1, 'ds_name', '/ds_path', json.dumps({}), json.dumps({}))])
        db.alter('TRUNCATE coordinates CASCADE')
    finally:
        db.close()
def test_theor_peaks_generator_run_failed_iso_peaks(create_fill_test_db, spark_context, sm_config, ds_config):
    ds_config["isotope_generation"]["adducts"] = ["+Na"]
    theor_peaks_gen = TheorPeaksGenerator(spark_context, sm_config, ds_config)
    theor_peaks_gen.isocalc_wrapper.isotope_peaks = lambda *args: Centroids([], [])
    theor_peaks_gen.run()

    db = DB(sm_config['db'])
    rows = db.select('SELECT * FROM theor_peaks')

    assert len(rows) == 1

    db.close()
def test_search_job_imzml_example(get_compute_img_measures_mock, filter_sf_metrics_mock, create_fill_sm_database, sm_config): get_compute_img_measures_mock.return_value = lambda *args: (0.9, 0.9, 0.9) filter_sf_metrics_mock.side_effect = lambda x: x SMConfig._config_dict = sm_config db = DB(sm_config['db']) try: job = SearchJob(None, 'imzml_example_ds') job.run(input_dir_path, ds_config_path, clean=True) # dataset meta asserts rows = db.select("SELECT name, file_path, img_bounds from dataset") img_bounds = {u'y': {u'max': 3, u'min': 1}, u'x': {u'max': 3, u'min': 1}} file_path = join(dirname(__file__), 'data', 'imzml_example_ds') assert len(rows) == 1 assert rows[0] == (test_ds_name, file_path, img_bounds) # theoretical patterns asserts rows = db.select('SELECT db_id, sf_id, adduct, centr_mzs, centr_ints, prof_mzs, prof_ints ' 'FROM theor_peaks ' 'ORDER BY adduct') assert len(rows) == 3 + len(DECOY_ADDUCTS) for r in rows: assert r[3] and r[4] # image metrics asserts rows = db.select(('SELECT db_id, sf_id, adduct, peaks_n, stats FROM iso_image_metrics ' 'ORDER BY sf_id, adduct')) assert rows assert rows[0] assert tuple(rows[0][:2]) == (0, 10007) assert set(rows[0][4].keys()) == {'chaos', 'spatial', 'spectral'} # image asserts rows = db.select(('SELECT db_id, sf_id, adduct, peak, intensities, min_int, max_int ' 'FROM iso_image ' 'ORDER BY sf_id, adduct')) assert rows max_int = 0.0 for r in rows: max_int = max(max_int, r[-1]) assert tuple(r[:2]) == (0, 10007) assert max_int finally: db.close() with warn_only(): local('rm -rf {}'.format(data_dir_path))
def test_search_job_imzml_example(get_compute_img_measures_mock, create_fill_sm_database, sm_config): get_compute_img_measures_mock.return_value = lambda *args: (0.9, 0.9, 0.9) SMConfig._config_dict = sm_config db = DB(sm_config['db']) try: job = SearchJob(None, 'imzml_example_ds') job.run(input_dir_path, ds_config_path, clean=True) # dataset meta asserts rows = db.select("SELECT name, file_path, img_bounds from dataset") img_bounds = {u'y': {u'max': 3, u'min': 1}, u'x': {u'max': 3, u'min': 1}} file_path = 'file://' + join(data_dir_path, 'ds.txt') assert len(rows) == 1 assert rows[0] == (test_ds_name, file_path, img_bounds) # theoretical patterns asserts rows = db.select('SELECT db_id, sf_id, adduct, centr_mzs, centr_ints, prof_mzs, prof_ints ' 'FROM theor_peaks ' 'ORDER BY adduct') assert len(rows) == 3 + len(DECOY_ADDUCTS) for r in rows: assert r[3] and r[4] # image metrics asserts rows = db.select(('SELECT db_id, sf_id, adduct, peaks_n, stats FROM iso_image_metrics ' 'ORDER BY sf_id, adduct')) assert rows assert rows[0] assert tuple(rows[0][:2]) == (0, 10007) assert set(rows[0][4].keys()) == {'chaos', 'spatial', 'spectral'} # image asserts rows = db.select(('SELECT db_id, sf_id, adduct, peak, intensities, min_int, max_int ' 'FROM iso_image ' 'ORDER BY sf_id, adduct')) assert rows max_int = 0.0 for r in rows: max_int = max(max_int, r[-1]) assert tuple(r[:2]) == (0, 10007) assert max_int finally: db.close() with warn_only(): local('rm -rf {}'.format(data_dir_path))
def create_fill_sm_database(create_test_db, drop_test_db, sm_config):
    local('psql -h localhost -U sm sm_test < {}'.format(join(proj_dir_path, 'scripts/create_schema.sql')))

    db = DB(sm_config['db'])
    try:
        db.insert('INSERT INTO formula_db VALUES (%s, %s, %s)',
                  [(0, '2016-01-01', 'HMDB')])
        db.insert('INSERT INTO formula VALUES (%s, %s, %s, %s, %s)',
                  [(100, 0, '00001', 'compound_name', 'C12H24O')])
        db.insert('INSERT INTO agg_formula VALUES (%s, %s, %s, %s, %s)',
                  [(10007, 0, 'C12H24O', ['00001'], ['compound_name'])])
    finally:
        db.close()
def create_fill_test_db(create_test_db, drop_test_db):
    db_config = dict(database='sm_test', user='******', host='localhost', password='******')
    db = DB(db_config)
    try:
        db.alter('TRUNCATE formula_db CASCADE')
        db.insert('INSERT INTO formula_db VALUES (%s, %s, %s)',
                  [(0, '2016-01-01', 'HMDB')])
        db.insert('INSERT INTO formula VALUES (%s, %s, %s, %s, %s)',
                  [(9, 0, '04138', 'Au', 'Gold')])
        db.insert('INSERT INTO agg_formula VALUES (%s, %s, %s, %s, %s)',
                  [(9, 0, 'Au', ['04138'], ['Gold'])])
        db.alter('TRUNCATE theor_peaks CASCADE')
        db.insert('INSERT INTO theor_peaks VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)',
                  [(0, 9, '+H', 0.01, 1, 10000, [100, 200], [10, 1], [], [])])
    finally:
        db.close()
def create_fill_sm_database(create_test_db, drop_test_db, sm_config):
    proj_dir_path = dirname(dirname(__file__))
    local('psql -h localhost -U sm sm_test < {}'.format(join(proj_dir_path, 'scripts/create_schema.sql')))

    db = DB(sm_config['db'])
    try:
        db.insert('INSERT INTO dataset VALUES (%s, %s, %s, %s, %s, %s)',
                  [(0, 'name', 0, 'fpath', json.dumps({}), json.dumps({}))])
        db.insert('INSERT INTO job VALUES (%s, %s, %s, %s, %s, %s, %s, %s)',
                  [(0, 0, 0, '', 0, 0, None, None)])
    finally:
        db.close()
def test_theor_peaks_generator_run_1(create_fill_test_db, spark_context, sm_config, ds_config):
    ds_config["isotope_generation"]["adducts"] = ["+H", "+Na"]
    theor_peaks_gen = TheorPeaksGenerator(spark_context, sm_config, ds_config)
    theor_peaks_gen.isocalc_wrapper.isotope_peaks = lambda *args: Centroids([100., 200.], [10., 1.])
    theor_peaks_gen.run()

    db = DB(sm_config['db'])
    rows = db.select('SELECT db_id, sf_id, adduct, sigma, charge, pts_per_mz, centr_mzs, '
                     'centr_ints, prof_mzs, prof_ints FROM theor_peaks ORDER BY sf_id, adduct')

    assert len(rows) == 2 + 80
    assert ([r for r in rows if r[2] == '+H'][0] ==
            (0, 9, '+H', 0.01, 1, 10000, [100., 200.], [10., 1.], [], []))
    assert ([r for r in rows if r[2] == '+Na'][0] ==
            (0, 9, '+Na', 0.01, 1, 10000, [100., 200.], [10., 1.], [], []))

    db.close()
def fill_db(test_db, sm_config, ds_config): upload_dt = '2000-01-01 00:00:00' ds_id = '2000-01-01' meta = {"meta": "data"} db = DB(sm_config['db']) db.insert( 'INSERT INTO dataset (id, name, input_path, upload_dt, metadata, config, ' 'status, is_public, mol_dbs, adducts) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)', rows=[(ds_id, 'ds_name', 'input_path', upload_dt, json.dumps(meta), json.dumps(ds_config), 'FINISHED', True, ['HMDB-v4'], ['+H'])]) db.insert("INSERT INTO job (id, db_id, ds_id) VALUES (%s, %s, %s)", rows=[(0, 0, ds_id)]) db.insert("INSERT INTO sum_formula (id, db_id, sf) VALUES (%s, %s, %s)", rows=[(1, 0, 'H2O')]) db.insert(( "INSERT INTO iso_image_metrics (job_id, db_id, sf, adduct, iso_image_ids) " "VALUES (%s, %s, %s, %s, %s)"), rows=[(0, 0, 'H2O', '+H', ['iso_image_1_id', 'iso_image_2_id'])]) db.close()
def test_annotate_ds(self, test_db, sm_config, ds_config): es_mock = MagicMock(spec=ESExporter) db = DB(sm_config['db']) try: manager = create_daemon_man(sm_config, db=db, es=es_mock) ds_id = '2000-01-01' ds_name = 'ds_name' input_path = 'input_path' upload_dt = datetime.now() metadata = {} ds = create_ds(ds_id=ds_id, ds_name=ds_name, input_path=input_path, upload_dt=upload_dt, metadata=metadata, ds_config=ds_config) manager.annotate(ds, search_job_factory=self.SearchJob) DS_SEL = 'select name, input_path, upload_dt, metadata, config from dataset where id=%s' assert db.select_one(DS_SEL, params=(ds_id,)) == (ds_name, input_path, upload_dt, metadata, ds_config) finally: db.close()
def test_db(sm_config, request):
    db_config = dict(**sm_config['db'])
    db_config['database'] = 'postgres'

    db = DB(db_config, autocommit=True)
    db.alter('DROP DATABASE IF EXISTS sm_test')
    db.alter('CREATE DATABASE sm_test')
    db.close()

    local('psql -h {} -U {} sm_test < {}'.format(
        sm_config['db']['host'], sm_config['db']['user'],
        Path(proj_root()) / 'scripts/create_schema.sql'))

    def fin():
        db = DB(db_config, autocommit=True)
        try:
            db.alter('DROP DATABASE IF EXISTS sm_test')
        except Exception as e:
            logging.getLogger('engine').warning('Drop sm_test database failed: %s', e)
        finally:
            db.close()

    request.addfinalizer(fin)
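# Usage sketch (not part of the original conftest): the function above is assumed to be
# registered as a pytest fixture, e.g. decorated with @pytest.fixture(). A test that lists
# `test_db` as a parameter then runs against a freshly created sm_test database with the
# schema loaded, and the finalizer drops the database afterwards. Hypothetical example test:
def test_runs_against_fresh_db(test_db, sm_config):
    db = DB(sm_config['db'])
    try:
        # the schema exists but no dataset rows have been inserted yet
        assert len(db.select('SELECT * FROM dataset')) == 0
    finally:
        db.close()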
class SearchJob(object):
    """ Main class responsible for molecule search. Uses other modules of the engine.

    Args
    ----
    no_clean : bool
        Don't delete interim data files
    """

    def __init__(self, img_store=None, no_clean=False):
        self.no_clean = no_clean
        self._img_store = img_store

        self._job_id = None
        self._sc = None
        self._db = None
        self._ds = None
        self._ds_reader = None
        self._status_queue = None
        self._fdr = None
        self._wd_manager = None
        self._es = None

        self._sm_config = SMConfig.get_conf()

        logger.debug('Using SM config:\n%s', pformat(self._sm_config))

    def _configure_spark(self):
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self._sm_config['spark'].items():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        if 'aws' in self._sm_config:
            sconf.set("spark.hadoop.fs.s3a.access.key", self._sm_config['aws']['aws_access_key_id'])
            sconf.set("spark.hadoop.fs.s3a.secret.key", self._sm_config['aws']['aws_secret_access_key'])
            sconf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            sconf.set("spark.hadoop.fs.s3a.endpoint",
                      "s3.{}.amazonaws.com".format(self._sm_config['aws']['aws_region']))

        self._sc = SparkContext(master=self._sm_config['spark']['master'], conf=sconf, appName='SM engine')

    def _init_db(self):
        logger.info('Connecting to the DB')
        self._db = DB(self._sm_config['db'])

    def store_job_meta(self, mol_db_id):
        """ Store search job metadata in the database """
        logger.info('Storing job metadata')
        rows = [(mol_db_id, self._ds.id, JobStatus.RUNNING, datetime.now().strftime('%Y-%m-%d %H:%M:%S'))]
        self._job_id = self._db.insert_return(JOB_INS, rows=rows)[0]

    def _run_annotation_job(self, mol_db):
        try:
            self.store_job_meta(mol_db.id)
            mol_db.set_job_id(self._job_id)

            logger.info("Running new job ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                        self._ds.id, self._ds.name, mol_db.name, mol_db.version)

            target_adducts = self._ds.config['isotope_generation']['adducts']
            self._fdr = FDR(job_id=self._job_id, decoy_sample_size=20,
                            target_adducts=target_adducts, db=self._db)

            isocalc = IsocalcWrapper(self._ds.config['isotope_generation'])
            centroids_gen = IonCentroidsGenerator(sc=self._sc, moldb_name=mol_db.name, isocalc=isocalc)
            polarity = self._ds.config['isotope_generation']['charge']['polarity']
            all_adducts = list(set(self._sm_config['defaults']['adducts'][polarity]) | set(DECOY_ADDUCTS))
            centroids_gen.generate_if_not_exist(isocalc=isocalc, sfs=mol_db.sfs, adducts=all_adducts)
            target_ions = centroids_gen.ions(target_adducts)
            self._fdr.decoy_adducts_selection(target_ions)

            search_alg = MSMBasicSearch(sc=self._sc, ds=self._ds, ds_reader=self._ds_reader,
                                        mol_db=mol_db, centr_gen=centroids_gen,
                                        fdr=self._fdr, ds_config=self._ds.config)
            ion_metrics_df, ion_iso_images = search_alg.search()

            search_results = SearchResults(mol_db.id, self._job_id, search_alg.metrics.keys())
            mask = self._ds_reader.get_2d_sample_area_mask()
            img_store_type = self._ds.get_ion_img_storage_type(self._db)
            search_results.store(ion_metrics_df, ion_iso_images, mask,
                                 self._db, self._img_store, img_store_type)
        except Exception as e:
            self._db.alter(JOB_UPD_STATUS_FINISH,
                           params=(JobStatus.FAILED,
                                   datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                   self._job_id))
            msg = 'Job failed(ds_id={}, mol_db={}): {}'.format(self._ds.id, mol_db, str(e))
            raise JobFailedError(msg) from e
        else:
            self._db.alter(JOB_UPD_STATUS_FINISH,
                           params=(JobStatus.FINISHED,
                                   datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                   self._job_id))

    def _remove_annotation_job(self, mol_db):
        logger.info("Removing job results ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                    self._ds.id, self._ds.name, mol_db.name, mol_db.version)
        self._db.alter('DELETE FROM job WHERE ds_id = %s and db_id = %s',
                       params=(self._ds.id, mol_db.id))
        self._es.delete_ds(self._ds.id, mol_db)

    def _moldb_ids(self):
        completed_moldb_ids = {db_id for (_, db_id)
                               in self._db.select(JOB_ID_MOLDB_ID_SEL, params=(self._ds.id,))}
        new_moldb_ids = {MolecularDB(name=moldb_name).id
                         for moldb_name in self._ds.config['databases']}
        return completed_moldb_ids, new_moldb_ids

    def _save_data_from_raw_ms_file(self):
        ms_file_type_config = SMConfig.get_ms_file_handler(self._wd_manager.local_dir.ms_file_path)
        acq_geometry_factory_module = ms_file_type_config['acq_geometry_factory']
        acq_geometry_factory = getattr(import_module(acq_geometry_factory_module['path']),
                                       acq_geometry_factory_module['name'])
        acq_geometry = acq_geometry_factory(self._wd_manager.local_dir.ms_file_path).create()
        self._ds.save_acq_geometry(self._db, acq_geometry)

        self._ds.save_ion_img_storage_type(self._db, ms_file_type_config['img_storage_type'])

    def run(self, ds):
        """ Entry point of the engine. Molecule search is completed in several steps:
            * Copying input data to the engine work dir
            * Converting input mass spec files to plain text format (one line per spectrum)
            * Generating theoretical peaks for all formulas from the molecule database and saving them to the database
            * Molecule search, the most compute-intensive part; Spark is used to run it in a distributed manner
            * Saving results (isotope images and their quality metrics for each putative molecule) to the database

        Args
        ----
        ds : sm.engine.dataset_manager.Dataset
        """
        try:
            logger.info('*' * 150)
            start = time.time()

            self._init_db()
            self._es = ESExporter(self._db)
            self._ds = ds

            if self._sm_config['rabbitmq']:
                self._status_queue = QueuePublisher(config=self._sm_config['rabbitmq'],
                                                    qdesc=SM_DS_STATUS, logger=logger)
            else:
                self._status_queue = None

            self._wd_manager = WorkDirManager(ds.id)
            self._configure_spark()

            if not self.no_clean:
                self._wd_manager.clean()

            self._ds_reader = DatasetReader(self._ds.input_path, self._sc, self._wd_manager)
            self._ds_reader.copy_convert_input_data()

            self._save_data_from_raw_ms_file()
            self._img_store.storage_type = self._ds.get_ion_img_storage_type(self._db)

            logger.info('Dataset config:\n%s', pformat(self._ds.config))

            completed_moldb_ids, new_moldb_ids = self._moldb_ids()
            for moldb_id in completed_moldb_ids.symmetric_difference(new_moldb_ids):  # ignore ids present in both sets
                mol_db = MolecularDB(id=moldb_id, db=self._db,
                                     iso_gen_config=self._ds.config['isotope_generation'])
                if moldb_id not in new_moldb_ids:
                    self._remove_annotation_job(mol_db)
                elif moldb_id not in completed_moldb_ids:
                    self._run_annotation_job(mol_db)

            logger.info("All done!")
            time_spent = time.time() - start
            logger.info('Time spent: %d mins %d secs', *divmod(int(round(time_spent)), 60))
        finally:
            if self._sc:
                self._sc.stop()
            if self._db:
                self._db.close()
            if self._wd_manager and not self.no_clean:
                self._wd_manager.clean()
            logger.info('*' * 150)
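# Usage sketch (based on the tests elsewhere in this suite, not part of the class itself):
# SMConfig is assumed to be populated already, e.g. via SMConfig._config_dict = sm_config
# as the tests do, and the dataset row is assumed to exist in the DB.
db = DB(sm_config['db'])
try:
    img_store = ImageStoreServiceWrapper(sm_config['services']['img_service_url'])
    job = SearchJob(img_store=img_store)
    ds = Dataset.load(db, '2000-01-01_00h00m')  # load an existing dataset row
    job.run(ds)  # opens its own DB/Spark connections and releases them in `finally`
finally:
    db.close()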
class SMAnnotateDaemon(object):
    """ Reads messages from the annotation queue and starts annotation jobs """
    logger = logging.getLogger('annotate-daemon')

    def __init__(self, manager, annot_qdesc, upd_qdesc, poll_interval=1):
        self._sm_config = SMConfig.get_conf()
        self._stopped = False
        self._annot_queue_consumer = QueueConsumer(config=self._sm_config['rabbitmq'], qdesc=annot_qdesc,
                                                   callback=self._callback,
                                                   on_success=self._on_success,
                                                   on_failure=self._on_failure,
                                                   logger=self.logger, poll_interval=poll_interval)
        self._upd_queue_pub = QueuePublisher(config=self._sm_config['rabbitmq'],
                                             qdesc=upd_qdesc, logger=self.logger)
        self._db = DB(self._sm_config['db'])
        self._manager = manager

    def _send_email(self, email, subj, body):
        try:
            cred_dict = dict(aws_access_key_id=self._sm_config['aws']['aws_access_key_id'],
                             aws_secret_access_key=self._sm_config['aws']['aws_secret_access_key'])
            ses = boto3.client('ses', 'eu-west-1', **cred_dict)
            resp = ses.send_email(Source='*****@*****.**',
                                  Destination={'ToAddresses': [email]},
                                  Message={'Subject': {'Data': subj},
                                           'Body': {'Text': {'Data': body}}})
        except Exception as e:
            self.logger.warning(f'Send email exception {e} for {email}')
        else:
            if resp['ResponseMetadata']['HTTPStatusCode'] == 200:
                self.logger.info(f'Email with "{subj}" subject was sent to {email}')
            else:
                self.logger.warning(f'SES failed to send email to {email}')

    def _on_success(self, msg):
        ds = Dataset.load(self._db, msg['ds_id'])
        ds.set_status(self._db, self._manager.es, self._manager.status_queue, DatasetStatus.FINISHED)

        self.logger.info(" SM annotate daemon: success")
        ds_name, _ = self._manager.fetch_ds_metadata(msg['ds_id'])
        msg['web_app_link'] = self._manager.create_web_app_link(msg)
        self._manager.post_to_slack('dart', ' [v] Annotation succeeded: {}'.format(json.dumps(msg)))

        if msg.get('email'):
            email_body = (
                'Dear METASPACE user,\n\n'
                'Thank you for uploading the "{}" dataset to the METASPACE annotation service. '
                'We are pleased to inform you that the dataset has been processed and is available at {}.\n\n'
                'Best regards,\n'
                'METASPACE Team'
            ).format(ds_name, msg['web_app_link'])
            self._send_email(msg['email'], 'METASPACE service notification (SUCCESS)', email_body)

    def _on_failure(self, msg):
        ds = Dataset.load(self._db, msg['ds_id'])
        ds.set_status(self._db, self._manager.es, self._manager.status_queue, DatasetStatus.FAILED)

        self.logger.error(" SM annotate daemon: failure", exc_info=True)
        ds_name, _ = self._manager.fetch_ds_metadata(msg['ds_id'])
        msg['web_app_link'] = self._manager.create_web_app_link(msg)
        self._manager.post_to_slack('hankey', ' [x] Annotation failed: {}'.format(json.dumps(msg)))

        if msg.get('email'):
            email_body = (
                'Dear METASPACE user,\n\n'
                'We are sorry to inform you that there was a problem during processing of the "{}" dataset '
                'and it could not be annotated. '
                'If this is unexpected, please do not hesitate to contact us for support at [email protected]\n\n'
                'Best regards,\n'
                'METASPACE Team'
            ).format(ds_name)
            self._send_email(msg['email'], 'METASPACE service notification (FAILED)', email_body)

    def _callback(self, msg):
        ds = Dataset.load(self._db, msg['ds_id'])
        ds.set_status(self._db, self._manager.es, self._manager.status_queue, DatasetStatus.ANNOTATING)

        self.logger.info(f" SM annotate daemon received a message: {msg}")
        self._manager.post_to_slack('new', " [v] New annotation message: {}".format(json.dumps(msg)))

        self._manager.annotate(ds=ds,
                               search_job_factory=SearchJob,
                               del_first=msg.get('del_first', False))

        upd_msg = {
            'ds_id': msg['ds_id'],
            'ds_name': msg['ds_name'],
            'action': 'update'
        }
        self._upd_queue_pub.publish(msg=upd_msg, priority=2)

    def start(self):
        self._stopped = False
        self._annot_queue_consumer.start()

    def stop(self):
        if not self._stopped:
            self._annot_queue_consumer.stop()
            self._annot_queue_consumer.join()
            self._stopped = True
        if self._db:
            self._db.close()
class SearchJob(object):
    """ Main class responsible for molecule search. Uses other modules of the engine.

    Args
    ----
    no_clean : bool
        Don't delete interim data files
    """

    def __init__(self, img_store=None, no_clean=False):
        self.no_clean = no_clean
        self._img_store = img_store

        self._job_id = None
        self._sc = None
        self._db = None
        self._ds = None
        self._ds_reader = None
        self._status_queue = None
        self._fdr = None
        self._wd_manager = None
        self._es = None

        self._sm_config = SMConfig.get_conf()

        logger.debug('Using SM config:\n%s', pformat(self._sm_config))

    def _configure_spark(self):
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self._sm_config['spark'].items():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        if 'aws' in self._sm_config:
            sconf.set("spark.hadoop.fs.s3a.access.key", self._sm_config['aws']['aws_access_key_id'])
            sconf.set("spark.hadoop.fs.s3a.secret.key", self._sm_config['aws']['aws_secret_access_key'])
            sconf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            sconf.set("spark.hadoop.fs.s3a.endpoint",
                      "s3.{}.amazonaws.com".format(self._sm_config['aws']['aws_region']))

        self._sc = SparkContext(master=self._sm_config['spark']['master'], conf=sconf, appName='SM engine')

    def _init_db(self):
        logger.info('Connecting to the DB')
        self._db = DB(self._sm_config['db'])

    def store_job_meta(self, mol_db_id):
        """ Store search job metadata in the database """
        logger.info('Storing job metadata')
        rows = [(mol_db_id, self._ds.id, 'STARTED', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))]
        self._job_id = self._db.insert_return(JOB_INS, rows=rows)[0]

    def _run_annotation_job(self, mol_db):
        try:
            self.store_job_meta(mol_db.id)
            mol_db.set_job_id(self._job_id)

            logger.info("Running new job ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                        self._ds.id, self._ds.name, mol_db.name, mol_db.version)

            target_adducts = self._ds.config['isotope_generation']['adducts']
            self._fdr = FDR(job_id=self._job_id, decoy_sample_size=20,
                            target_adducts=target_adducts, db=self._db)

            isocalc = IsocalcWrapper(self._ds.config['isotope_generation'])
            centroids_gen = IonCentroidsGenerator(sc=self._sc, moldb_name=mol_db.name, isocalc=isocalc)
            polarity = self._ds.config['isotope_generation']['charge']['polarity']
            all_adducts = list(set(self._sm_config['defaults']['adducts'][polarity]) | set(DECOY_ADDUCTS))
            centroids_gen.generate_if_not_exist(isocalc=isocalc, sfs=mol_db.sfs, adducts=all_adducts)
            target_ions = centroids_gen.ions(target_adducts)
            self._fdr.decoy_adducts_selection(target_ions)

            search_alg = MSMBasicSearch(sc=self._sc, ds=self._ds, ds_reader=self._ds_reader,
                                        mol_db=mol_db, centr_gen=centroids_gen,
                                        fdr=self._fdr, ds_config=self._ds.config)
            ion_metrics_df, ion_iso_images = search_alg.search()

            search_results = SearchResults(mol_db.id, self._job_id, search_alg.metrics.keys())
            mask = self._ds_reader.get_2d_sample_area_mask()
            img_store_type = self._ds.get_ion_img_storage_type(self._db)
            search_results.store(ion_metrics_df, ion_iso_images, mask,
                                 self._db, self._img_store, img_store_type)
        except Exception as e:
            self._db.alter(JOB_UPD, params=('FAILED', datetime.now().strftime('%Y-%m-%d %H:%M:%S'), self._job_id))
            msg = 'Job failed(ds_id={}, mol_db={}): {}'.format(self._ds.id, mol_db, str(e))
            raise JobFailedError(msg) from e
        else:
            self._export_search_results_to_es(mol_db, isocalc)

    def _export_search_results_to_es(self, mol_db, isocalc):
        try:
            self._es.index_ds(self._ds.id, mol_db, isocalc)
        except Exception as e:
            self._db.alter(JOB_UPD, params=('FAILED', datetime.now().strftime('%Y-%m-%d %H:%M:%S'), self._job_id))
            msg = 'Export to ES failed(ds_id={}, mol_db={}): {}'.format(self._ds.id, mol_db, str(e))
            raise ESExportFailedError(msg) from e
        else:
            self._db.alter(JOB_UPD, params=('FINISHED', datetime.now().strftime('%Y-%m-%d %H:%M:%S'), self._job_id))

    def _remove_annotation_job(self, mol_db):
        logger.info("Removing job results ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                    self._ds.id, self._ds.name, mol_db.name, mol_db.version)
        self._db.alter('DELETE FROM job WHERE ds_id = %s and db_id = %s',
                       params=(self._ds.id, mol_db.id))
        self._es.delete_ds(self._ds.id, mol_db)

    def _moldb_ids(self):
        moldb_service = MolDBServiceWrapper(self._sm_config['services']['mol_db'])
        completed_moldb_ids = {moldb_service.find_db_by_id(db_id)['id']
                               for (_, db_id) in self._db.select(JOB_ID_MOLDB_ID_SEL, params=(self._ds.id,))}
        new_moldb_ids = {moldb_service.find_db_by_name_version(moldb_name)[0]['id']
                         for moldb_name in self._ds.config['databases']}
        return completed_moldb_ids, new_moldb_ids

    def _save_data_from_raw_ms_file(self):
        ms_file_type_config = SMConfig.get_ms_file_handler(self._wd_manager.local_dir.ms_file_path)
        acq_geometry_factory_module = ms_file_type_config['acq_geometry_factory']
        acq_geometry_factory = getattr(import_module(acq_geometry_factory_module['path']),
                                       acq_geometry_factory_module['name'])
        acq_geometry = acq_geometry_factory(self._wd_manager.local_dir.ms_file_path).create()
        self._ds.save_acq_geometry(self._db, acq_geometry)

        self._ds.save_ion_img_storage_type(self._db, ms_file_type_config['img_storage_type'])

    def run(self, ds):
        """ Entry point of the engine. Molecule search is completed in several steps:
            * Copying input data to the engine work dir
            * Converting input mass spec files to plain text format (one line per spectrum)
            * Generating theoretical peaks for all formulas from the molecule database and saving them to the database
            * Molecule search, the most compute-intensive part; Spark is used to run it in a distributed manner
            * Saving results (isotope images and their quality metrics for each putative molecule) to the database

        Args
        ----
        ds : sm.engine.dataset_manager.Dataset
        """
        try:
            start = time.time()

            self._init_db()
            self._es = ESExporter(self._db)
            self._ds = ds

            if self._sm_config['rabbitmq']:
                self._status_queue = QueuePublisher(config=self._sm_config['rabbitmq'],
                                                    qdesc=SM_DS_STATUS, logger=logger)
            else:
                self._status_queue = None
            ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.STARTED)

            self._wd_manager = WorkDirManager(ds.id)
            self._configure_spark()

            if not self.no_clean:
                self._wd_manager.clean()

            self._ds_reader = DatasetReader(self._ds.input_path, self._sc, self._wd_manager)
            self._ds_reader.copy_convert_input_data()

            self._save_data_from_raw_ms_file()
            self._img_store.storage_type = self._ds.get_ion_img_storage_type(self._db)

            ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.STARTED)

            logger.info('Dataset config:\n%s', pformat(self._ds.config))

            completed_moldb_ids, new_moldb_ids = self._moldb_ids()
            for moldb_id in completed_moldb_ids.symmetric_difference(new_moldb_ids):  # ignore ids present in both sets
                mol_db = MolecularDB(id=moldb_id, db=self._db,
                                     iso_gen_config=self._ds.config['isotope_generation'])
                if moldb_id not in new_moldb_ids:
                    self._remove_annotation_job(mol_db)
                elif moldb_id not in completed_moldb_ids:
                    self._run_annotation_job(mol_db)

            ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.FINISHED)

            logger.info("All done!")
            time_spent = time.time() - start
            logger.info('Time spent: %d mins %d secs', *divmod(int(round(time_spent)), 60))
        except Exception as e:
            if self._ds:
                ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.FAILED)
            logger.error(e, exc_info=True)
            raise
        finally:
            if self._sc:
                self._sc.stop()
            if self._db:
                self._db.close()
            if self._wd_manager and not self.no_clean:
                self._wd_manager.clean()
            logger.info('*' * 150)
def test_sm_daemons(get_compute_img_metrics_mock, filter_sf_metrics_mock, post_images_to_annot_service_mock, MolDBServiceWrapperMock, sm_config, test_db, es_dsl_search, clean_isotope_storage): init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock) get_compute_img_metrics_mock.return_value = lambda *args: (0.9, 0.9, 0.9, [100.], [0], [10.]) filter_sf_metrics_mock.side_effect = lambda x: x url_dict = { 'iso_image_ids': ['iso_image_1', None, None, None] } post_images_to_annot_service_mock.return_value = { 35: url_dict, 44: url_dict } db = DB(sm_config['db']) es = ESExporter(db) annotate_daemon = None update_daemon = None try: ds_config_str = open(ds_config_path).read() upload_dt = datetime.now() ds_id = '2000-01-01_00h00m' db.insert(Dataset.DS_INSERT, [{ 'id': ds_id, 'name': test_ds_name, 'input_path': input_dir_path, 'upload_dt': upload_dt, 'metadata': '{"Data_Type": "Imaging MS"}', 'config': ds_config_str, 'status': DatasetStatus.QUEUED, 'is_public': True, 'mol_dbs': ['HMDB-v4'], 'adducts': ['+H', '+Na', '+K'], 'ion_img_storage': 'fs' }]) ds = Dataset.load(db, ds_id) queue_pub.publish({'ds_id': ds.id, 'ds_name': ds.name, 'action': 'annotate'}) run_daemons(db, es) # dataset table asserts rows = db.select('SELECT id, name, input_path, upload_dt, status from dataset') input_path = join(dirname(__file__), 'data', test_ds_name) assert len(rows) == 1 assert rows[0] == (ds_id, test_ds_name, input_path, upload_dt, DatasetStatus.FINISHED) # ms acquisition geometry asserts rows = db.select('SELECT acq_geometry from dataset') assert len(rows) == 1 assert rows[0][0] == ds.get_acq_geometry(db) assert rows[0][0] == { ACQ_GEOMETRY_KEYS.LENGTH_UNIT: 'nm', ACQ_GEOMETRY_KEYS.AcqGridSection.section_name: { ACQ_GEOMETRY_KEYS.AcqGridSection.REGULAR_GRID: True, ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_X : 3, ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_Y : 3, ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_X : 100, ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_Y : 100 }, ACQ_GEOMETRY_KEYS.PixelSizeSection.section_name: { ACQ_GEOMETRY_KEYS.PixelSizeSection.REGULAR_SIZE: True, ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_X : 100, ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_Y : 100 } } # job table asserts rows = db.select('SELECT db_id, ds_id, status, start, finish from job') assert len(rows) == 1 db_id, ds_id, status, start, finish = rows[0] assert (db_id, ds_id, status) == (0, '2000-01-01_00h00m', JobStatus.FINISHED) assert start < finish # image metrics asserts rows = db.select(('SELECT db_id, sf, adduct, stats, iso_image_ids ' 'FROM iso_image_metrics ' 'ORDER BY sf, adduct')) assert rows[0] == (0, 'C12H24O', '+K', {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9, 'total_iso_ints': [100.], 'min_iso_ints': [0], 'max_iso_ints': [10.]}, ['iso_image_1', None, None, None]) assert rows[1] == (0, 'C12H24O', '+Na', {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9, 'total_iso_ints': [100.], 'min_iso_ints': [0], 'max_iso_ints': [10.]}, ['iso_image_1', None, None, None]) time.sleep(1) # Waiting for ES # ES asserts ds_docs = es_dsl_search.query('term', _type='dataset').execute().to_dict()['hits']['hits'] assert 1 == len(ds_docs) ann_docs = es_dsl_search.query('term', _type='annotation').execute().to_dict()['hits']['hits'] assert len(ann_docs) == len(rows) for doc in ann_docs: assert doc['_id'].startswith(ds_id) finally: db.close() if annotate_daemon: annotate_daemon.stop() if update_daemon: update_daemon.stop() with warn_only(): local('rm -rf {}'.format(data_dir_path))
def test_search_job_imzml_example(get_compute_img_metrics_mock, filter_sf_metrics_mock, post_images_to_annot_service_mock, MolDBServiceWrapperMock, MolDBServiceWrapperMock2, sm_config, create_fill_sm_database, es_dsl_search, clean_isotope_storage): init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock) init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock2) get_compute_img_metrics_mock.return_value = lambda *args: (0.9, 0.9, 0.9, [100.], [0], [10.]) filter_sf_metrics_mock.side_effect = lambda x: x url_dict = { 'iso_image_ids': ['iso_image_1', None, None, None] } post_images_to_annot_service_mock.return_value = { 35: url_dict, 44: url_dict } db = DB(sm_config['db']) try: ds_config_str = open(ds_config_path).read() upload_dt = datetime.now() ds_id = '2000-01-01_00h00m' db.insert(Dataset.DS_INSERT, [{ 'id': ds_id, 'name': test_ds_name, 'input_path': input_dir_path, 'upload_dt': upload_dt, 'metadata': '{}', 'config': ds_config_str, 'status': DatasetStatus.QUEUED, 'is_public': True, 'mol_dbs': ['HMDB-v4'], 'adducts': ['+H', '+Na', '+K'], 'ion_img_storage': 'fs' }]) img_store = ImageStoreServiceWrapper(sm_config['services']['img_service_url']) job = SearchJob(img_store=img_store) job._sm_config['rabbitmq'] = {} # avoid talking to RabbitMQ during the test ds = Dataset.load(db, ds_id) job.run(ds) # dataset table asserts rows = db.select('SELECT id, name, input_path, upload_dt, status from dataset') input_path = join(dirname(__file__), 'data', test_ds_name) assert len(rows) == 1 assert rows[0] == (ds_id, test_ds_name, input_path, upload_dt, DatasetStatus.FINISHED) # ms acquisition geometry asserts rows = db.select('SELECT acq_geometry from dataset') assert len(rows) == 1 assert rows[0][0] == ds.get_acq_geometry(db) assert rows[0][0] == { ACQ_GEOMETRY_KEYS.LENGTH_UNIT: 'nm', ACQ_GEOMETRY_KEYS.AcqGridSection.section_name: { ACQ_GEOMETRY_KEYS.AcqGridSection.REGULAR_GRID: True, ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_X : 3, ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_Y : 3, ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_X : 100, ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_Y : 100 }, ACQ_GEOMETRY_KEYS.PixelSizeSection.section_name: { ACQ_GEOMETRY_KEYS.PixelSizeSection.REGULAR_SIZE: True, ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_X : 100, ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_Y : 100 } } # job table asserts rows = db.select('SELECT db_id, ds_id, status, start, finish from job') assert len(rows) == 1 db_id, ds_id, status, start, finish = rows[0] assert (db_id, ds_id, status) == (0, '2000-01-01_00h00m', 'FINISHED') assert start < finish # image metrics asserts rows = db.select(('SELECT db_id, sf, adduct, stats, iso_image_ids ' 'FROM iso_image_metrics ' 'ORDER BY sf, adduct')) assert rows[0] == (0, 'C12H24O', '+K', {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9, 'total_iso_ints': [100.], 'min_iso_ints': [0], 'max_iso_ints': [10.]}, ['iso_image_1', None, None, None]) assert rows[1] == (0, 'C12H24O', '+Na', {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9, 'total_iso_ints': [100.], 'min_iso_ints': [0], 'max_iso_ints': [10.]}, ['iso_image_1', None, None, None]) time.sleep(1) # Waiting for ES # ES asserts ds_docs = es_dsl_search.query('term', _type='dataset').execute().to_dict()['hits']['hits'] assert 1 == len(ds_docs) ann_docs = es_dsl_search.query('term', _type='annotation').execute().to_dict()['hits']['hits'] assert len(ann_docs) == len(rows) for doc in ann_docs: assert doc['_id'].startswith(ds_id) finally: db.close() with warn_only(): local('rm -rf {}'.format(data_dir_path))
class SMUpdateDaemon(object): """ Reads messages from update queue and does updates/deletes """ logger = logging.getLogger('update-daemon') def __init__(self, manager, update_qdesc, poll_interval=1): self._manager = manager self._sm_config = SMConfig.get_conf() self._db = DB(self._sm_config['db']) self._update_queue_cons = QueueConsumer( config=self._sm_config['rabbitmq'], qdesc=update_qdesc, callback=self._callback, on_success=self._on_success, on_failure=self._on_failure, logger=self.logger, poll_interval=poll_interval) self._status_queue_pub = QueuePublisher( config=self._sm_config['rabbitmq'], qdesc=SM_DS_STATUS, logger=self.logger) self._stopped = False def _post_to_slack(self, msg): if msg['action'] == 'update': msg['web_app_link'] = self._manager.create_web_app_link(msg) self._manager.post_to_slack( 'dart', f' [v] Update succeeded: {json.dumps(msg)}') elif msg['action'] == 'delete': self._manager.post_to_slack( 'dart', f' [v] Delete succeeded: {json.dumps(msg)}') def _on_success(self, msg): ds = Dataset.load(self._db, msg['ds_id']) ds.set_status(self._db, self._manager.es, self._manager.status_queue, DatasetStatus.FINISHED) self.logger.info(f" SM update daemon: success") self._post_to_slack(msg) def _on_failure(self, msg): ds = Dataset.load(self._db, msg['ds_id']) ds.set_status(self._db, self._manager.es, self._manager.status_queue, DatasetStatus.FAILED) self.logger.error(f" SM update daemon: failure", exc_info=True) self._post_to_slack(msg) def _callback(self, msg): ds = Dataset.load(self._db, msg['ds_id']) ds.set_status(self._db, self._manager.es, self._manager.status_queue, DatasetStatus.INDEXING) self.logger.info(f' SM update daemon received a message: {msg}') self._manager.post_to_slack( 'new', f" [v] New {msg['action']} message: {json.dumps(msg)}") if msg['action'] == 'update': self._manager.index(ds=ds) elif msg['action'] == 'delete': self._manager.delete(ds=ds) else: raise Exception(f"Wrong action: {msg['action']}") def start(self): self._stopped = False self._update_queue_cons.start() def stop(self): if not self._stopped: self._update_queue_cons.stop() self._update_queue_cons.join() self._stopped = True if self._db: self._db.close()
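# Message shapes exchanged by the two daemons, as used in the callbacks and tests above
# (a sketch with illustrative values; queue names come from the qdesc objects passed to
# the constructors and are not shown here).

annotate_msg = {
    'ds_id': '2000-01-01_00h00m',
    'ds_name': 'test_ds',
    'action': 'annotate',
    # optional keys handled by SMAnnotateDaemon: 'del_first': True, 'email': '...'
}

# After a successful annotation, SMAnnotateDaemon republishes to the update queue
# with priority=2:
update_msg = {'ds_id': annotate_msg['ds_id'], 'ds_name': annotate_msg['ds_name'], 'action': 'update'}

# SMUpdateDaemon accepts 'update' (re-index the dataset) or 'delete'; any other
# action raises an exception in its _callback.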
class SearchJob(object):
    """ Main class responsible for molecule search. Uses other modules of the engine.

    Args
    ----
    ds_name : string
        A dataset short name
    """

    def __init__(self, client_email, ds_name):
        self.sm_config = SMConfig.get_conf()
        self.client_email = client_email
        self.ds_name = ds_name

        self.ds_id = None
        self.job_id = None
        self.sc = None
        self.db = None
        self.ds = None
        self.fdr = None
        self.formulas = None
        self.ds_config = None
        self.wd_manager = None

    def _read_ds_config(self):
        with open(self.wd_manager.ds_config_path) as f:
            self.ds_config = json.load(f)

    def _configure_spark(self):
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self.sm_config['spark'].items():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        if 'aws' in self.sm_config:
            sconf.set("spark.hadoop.fs.s3a.access.key", self.sm_config['aws']['aws_access_key_id'])
            sconf.set("spark.hadoop.fs.s3a.secret.key", self.sm_config['aws']['aws_secret_access_key'])
            sconf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        # sconf.set("spark.python.profile", "true")

        self.sc = SparkContext(master=self.sm_config['spark']['master'], conf=sconf, appName='SM engine')
        if not self.sm_config['spark']['master'].startswith('local'):
            self.sc.addPyFile(join(local_path(proj_root()), 'sm.zip'))

    def _init_db(self):
        logger.info('Connecting to the DB')
        self.db = DB(self.sm_config['db'])
        self.sf_db_id = self.db.select_one(DB_ID_SEL, self.ds_config['database']['name'])[0]

    def store_job_meta(self):
        """ Store search job metadata in the database """
        logger.info('Storing job metadata')
        self.ds_id = int(self.db.select_one(DS_ID_SEL, self.ds_name)[0])
        self.job_id = self.ds_id
        self.db.alter(DEL_JOB_SQL, self.job_id)
        rows = [(self.job_id, self.sf_db_id, self.ds_id, datetime.now().strftime('%Y-%m-%d %H:%M:%S'))]
        self.db.insert(JOB_INS, rows)

        rows = [(self.job_id, adduct) for adduct in self.ds_config['isotope_generation']['adducts']]
        self.db.insert(ADDUCT_INS, rows)

    def run(self, input_path, ds_config_path, clean=False):
        """ Entry point of the engine. Molecule search is completed in several steps:
            * Copying input data to the engine work dir
            * Converting input data (imzML+ibd) to plain text format (one line per spectrum)
            * Generating theoretical peaks for all formulas from the molecule database and saving them to the database
            * Molecule search, the most compute-intensive part; Spark is used to run it in a distributed manner
            * Saving results (isotope images and their quality metrics for each putative molecule) to the database

        Args
        ----
        input_path : string
            Path to the dataset folder with .imzML and .ibd files
        ds_config_path : string
            Path to the dataset config file
        clean : bool
            Clean all interim data files before starting molecule search
        """
        try:
            self.wd_manager = WorkDirManager(self.ds_name)
            if clean:
                self.wd_manager.clean()

            self.wd_manager.copy_input_data(input_path, ds_config_path)
            self._read_ds_config()
            logger.info('Dataset config:\n%s', pformat(self.ds_config))

            self._configure_spark()
            self._init_db()

            if not self.wd_manager.exists(self.wd_manager.txt_path):
                imzml_converter = ImzmlTxtConverter(self.ds_name,
                                                    self.wd_manager.local_dir.imzml_path,
                                                    self.wd_manager.local_dir.txt_path,
                                                    self.wd_manager.local_dir.coord_path)
                imzml_converter.convert()

                if not self.wd_manager.local_fs_only:
                    self.wd_manager.upload_to_remote()

            self.ds = Dataset(self.sc, self.ds_name, self.client_email, input_path,
                              self.ds_config, self.wd_manager, self.db)
            self.ds.save_ds_meta()

            self.store_job_meta()

            theor_peaks_gen = TheorPeaksGenerator(self.sc, self.sm_config, self.ds_config)
            theor_peaks_gen.run()

            target_adducts = self.ds_config['isotope_generation']['adducts']
            self.fdr = FDR(self.job_id, self.sf_db_id,
                           decoy_sample_size=20, target_adducts=target_adducts, db=self.db)
            self.fdr.decoy_adduct_selection()
            self.formulas = FormulasSegm(self.job_id, self.sf_db_id, self.ds_config, self.db)

            # search_alg = MSMBasicSearch(self.sc, self.ds, self.formulas, self.fdr, self.ds_config)
            search_alg = MSMExtraFeats(self.sc, self.ds, self.formulas, self.fdr, self.ds_config)
            sf_metrics_df, sf_iso_images = search_alg.search()

            search_results = SearchResults(self.sf_db_id, self.ds_id, self.job_id, self.ds_name,
                                           self.formulas.get_sf_adduct_peaksn(),
                                           self.db, self.sm_config, self.ds_config)
            search_results.sf_metrics_df = sf_metrics_df
            search_results.sf_iso_images = sf_iso_images
            search_results.metrics = search_alg.metrics
            search_results.nrows, search_results.ncols = self.ds.get_dims()
            search_results.store()

            es = ESExporter(self.sm_config)
            es.index_ds(self.db, self.ds_name, self.ds_config['database']['name'])
        except Exception:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            logger.error('\n'.join(traceback.format_exception(exc_type, exc_value, exc_traceback)))
        finally:
            if self.sc:
                # self.sc.show_profiles()
                self.sc.stop()
            if self.db:
                self.db.close()