def run_off_sample(sm_config, ds_ids_str, sql_where, fix_missing, overwrite_existing):
    """Run off-sample classification for a selection of datasets and reindex them.

    The dataset ids come from the first truthy selector, checked in this order:
    ``ds_ids_str`` (comma-separated ids), ``sql_where`` (interpolated verbatim
    into a ``WHERE`` clause on the dataset table), ``fix_missing`` (ids returned
    by ``MISSING_OFF_SAMPLE_SEL``). If nothing is selected a warning is logged
    and the function returns.

    A failure on one dataset is logged with its traceback and does not stop
    the remaining datasets from being processed.
    """
    db = DB()

    if ds_ids_str:
        ds_ids = ds_ids_str.split(',')
    elif sql_where:
        # NOTE: sql_where is inserted into the statement as-is.
        rows = db.select(f'SELECT DISTINCT dataset.id FROM dataset WHERE {sql_where}')
        ds_ids = [found_id for (found_id,) in rows]
    elif fix_missing:
        logger.info('Checking for missing off-sample jobs...')
        ds_ids = [missing_id for (missing_id,) in db.select(MISSING_OFF_SAMPLE_SEL)]
        logger.info(f'Found {len(ds_ids)} missing off-sample sets')
    else:
        ds_ids = None

    if not ds_ids:
        logger.warning('No datasets match filter')
        return

    es_exp = ESExporter(db, sm_config)
    total = len(ds_ids)
    for position, ds_id in enumerate(ds_ids, start=1):
        try:
            logger.info(f'Running off-sample on {position} out of {total}')
            dataset = Dataset.load(db, ds_id)
            classify_dataset_ion_images(
                db, dataset, sm_config['services'], overwrite_existing
            )
            es_exp.reindex_ds(ds_id)
        except Exception:
            logger.error(f'Failed to run off-sample on {ds_id}', exc_info=True)
def test_delete_ds__completely(es, sm_index, sm_config):
    """Deleting 'dataset1' removes its annotation and dataset documents only.

    Three annotations (two for 'dataset1', one for 'dataset2') and one dataset
    document are created; after ``delete_ds`` only the 'dataset2' annotation
    must still be countable.
    """
    index = sm_config['elasticsearch']['index']

    # (doc_type, document id, ds_id, db_name) for every document to create.
    seed_docs = (
        ('annotation', 'id1', 'dataset1', 'HMDB'),
        ('annotation', 'id2', 'dataset1', 'ChEBI'),
        ('annotation', 'id3', 'dataset2', 'HMDB'),
        ('dataset', 'dataset1', 'dataset1', 'HMDB'),
    )
    for doc_type, doc_id, owner_ds_id, db_name in seed_docs:
        es.create(
            index=index,
            doc_type=doc_type,
            id=doc_id,
            body={'ds_id': owner_ds_id, 'db_name': db_name, 'db_version': '2016'},
        )
    wait_for_es(sec=1)

    db_mock = MagicMock(spec=DB)
    es_exporter = ESExporter(db_mock)
    es_exporter.delete_ds(ds_id='dataset1')
    wait_for_es(sec=1)

    def count_docs(doc_type, owner_ds_id, extra_term):
        """Count documents of `doc_type` matching the dataset id and one extra term."""
        body = {
            'query': {
                'bool': {
                    'filter': [{'term': {'ds_id': owner_ds_id}}, {'term': extra_term}]
                }
            }
        }
        return es.count(index=index, doc_type=doc_type, body=body)['count']

    assert count_docs('annotation', 'dataset1', {'db_name': 'HMDB'}) == 0
    assert count_docs('annotation', 'dataset1', {'db_name': 'ChEBI'}) == 0
    assert count_docs('annotation', 'dataset2', {'db_name': 'HMDB'}) == 1
    assert count_docs('dataset', 'dataset1', {'_type': 'dataset'}) == 0
def test_delete_ds__completely(es, sm_index, sm_config):
    """Check that ``delete_ds('dataset1')`` leaves other datasets' documents intact."""
    index = sm_config['elasticsearch']['index']

    def create_doc(doc_type, doc_id, owner_ds_id, db_name):
        # All seeded documents share the same three-field body layout.
        es.create(
            index=index,
            doc_type=doc_type,
            id=doc_id,
            body={'ds_id': owner_ds_id, 'db_name': db_name, 'db_version': '2016'},
        )

    create_doc('annotation', 'id1', 'dataset1', 'HMDB')
    create_doc('annotation', 'id2', 'dataset1', 'ChEBI')
    create_doc('annotation', 'id3', 'dataset2', 'HMDB')
    create_doc('dataset', 'dataset1', 'dataset1', 'HMDB')
    wait_for_es(sec=1)

    db_mock = MagicMock(spec=DB)
    es_exporter = ESExporter(db_mock)
    es_exporter.delete_ds(ds_id='dataset1')
    wait_for_es(sec=1)

    # (doc_type, filter terms, expected number of remaining documents)
    expectations = [
        ('annotation', [{'term': {'ds_id': 'dataset1'}}, {'term': {'db_name': 'HMDB'}}], 0),
        ('annotation', [{'term': {'ds_id': 'dataset1'}}, {'term': {'db_name': 'ChEBI'}}], 0),
        ('annotation', [{'term': {'ds_id': 'dataset2'}}, {'term': {'db_name': 'HMDB'}}], 1),
        ('dataset', [{'term': {'ds_id': 'dataset1'}}, {'term': {'_type': 'dataset'}}], 0),
    ]
    for doc_type, filters, expected_count in expectations:
        body = {'query': {'bool': {'filter': filters}}}
        assert es.count(index=index, doc_type=doc_type, body=body)['count'] == expected_count
def test_foo(sm_config):
    """Index two mocked annotation rows and read both documents back by id."""
    formulas = ('H20', 'Au')
    annotations = [('test_ds', 'test_db', sf, '+H', [], []) for sf in formulas]

    db_mock = MagicMock(DB)
    db_mock.select.return_value = annotations

    es_exp = ESExporter(sm_config)
    es_exp.index_ds(db_mock, 'test_ds', 'test_db')

    es = Elasticsearch()
    for sf in formulas:
        d = es.get(
            index='sm',
            id=f'test_ds_test_db_{sf}_+H',
            doc_type='annotation',
            _source=True,
        )
        assert d['_source'] == {
            'ds_name': 'test_ds',
            'db_name': 'test_db',
            'sf': sf,
            'adduct': '+H',
            'comp_names': '',
            'comp_ids': '',
        }
def reindex_results(sm_config, ds_id, ds_mask, use_inactive_index, offline_reindex, update_fields):
    """Reindex (or partially update) dataset documents in Elasticsearch.

    Args:
        sm_config: engine configuration; its 'elasticsearch' section is used.
        ds_id: comma-separated dataset ids, or a falsy value.
        ds_mask: dataset name prefix; used only when ``ds_id`` is falsy.
        use_inactive_index: when truthy, the ES config is replaced by the
            result of ``get_inactive_index_es_config``.
        offline_reindex: when truthy, ``_reindex_all`` is called and every
            other selector is ignored.
        update_fields: comma-separated field names; when truthy a partial
            update is performed instead of a full reindex.

    At least one of ``ds_id``, ``ds_mask`` and ``offline_reindex`` must be truthy.
    """
    assert ds_id or ds_mask or offline_reindex

    IsocalcWrapper.set_centroids_cache_enabled(True)

    if offline_reindex:
        _reindex_all(sm_config)
    else:
        es_config = sm_config['elasticsearch']
        if use_inactive_index:
            es_config = get_inactive_index_es_config(es_config)

        db = DB()
        es_exp = ESExporter(db, sm_config={**sm_config, 'elasticsearch': es_config})

        if ds_id:
            ds_ids = ds_id.split(',')
        elif ds_mask:
            # Bind the prefix pattern as a query parameter instead of formatting
            # it into the SQL text, so quotes in the mask cannot break or alter
            # the statement.
            rows = db.select(
                'select id from dataset where name like %s',
                params=('{}%'.format(ds_mask),),
            )
            ds_ids = [row_id for (row_id,) in rows]
        else:
            ds_ids = []

        if update_fields:
            _partial_update_datasets(ds_ids, es_exp, update_fields.split(','))
        else:
            _reindex_datasets(ds_ids, es_exp)
def migrate_isotopic_images(ds_id):
    """Transfer a dataset's isotopic images and reindex its ES documents.

    Nothing is transferred when the dataset has no image ids or when
    ``_es_docs_migrated`` reports its ES documents as already migrated.
    """
    output.print('Migrating isotopic images')

    db = DB()
    image_ids = db.select_onecol(SEL_DS_IMG_IDS, params=(ds_id,))
    es_exporter = ESExporter(db, sm_config)

    # Guard clause: skip when there is nothing to move or it was done before.
    if not image_ids or _es_docs_migrated(es_exporter._es, ds_id):
        return

    with timeit():
        output.print('Transferring images...')
        output.print(len(image_ids))
        transfer_images(ds_id, 'iso_images', image_storage.ISO, image_ids)

    with timeit():
        output.print('Reindexing ES documents...')
        es_exporter.reindex_ds(ds_id)
def test_foo(sm_config):
    """Index two mocked annotation rows and verify the stored docs, including 'mz'."""
    # (formula, m/z value in the mocked row, 'mz' string expected in the ES document)
    cases = (
        ('H20', 100, '00100.0000'),
        ('Au', 200, '00200.0000'),
    )
    annotations = [('test_ds', 'test_db', sf, '+H', [], [], mz) for sf, mz, _ in cases]

    db_mock = MagicMock(DB)
    db_mock.select.return_value = annotations

    es_exp = ESExporter(sm_config)
    es_exp.index_ds(db_mock, 'test_ds', 'test_db')

    es = Elasticsearch()
    for sf, _, expected_mz in cases:
        d = es.get(
            index='sm',
            id=f'test_ds_test_db_{sf}_+H',
            doc_type='annotation',
            _source=True,
        )
        assert d['_source'] == {
            'ds_name': 'test_ds',
            'db_name': 'test_db',
            'sf': sf,
            'adduct': '+H',
            'comp_names': '',
            'comp_ids': '',
            'mz': expected_mz,
        }
def test_sm_daemons_annot_fails(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                                post_images_to_annot_service_mock, MolDBServiceWrapperMock,
                                sm_config, test_db, es_dsl_search, clean_isotope_storage):
    """A failing metrics function must leave both the dataset and its job 'FAILED'.

    The metrics mock returns a function that always raises; a queued dataset is
    inserted, an 'annotate' message is published and the daemons are run. The
    DB connection is closed and the data directory removed even if an
    assertion fails.
    """
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)

    def throw_exception_function(*args):
        raise Exception('Test')

    get_compute_img_metrics_mock.return_value = throw_exception_function
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {'iso_image_ids': ['iso_image_1', None, None, None]}
    post_images_to_annot_service_mock.return_value = {35: url_dict, 44: url_dict}

    db = DB(sm_config['db'])
    try:
        # Created inside the try so that db.close() still runs if this raises.
        es = ESExporter(db)

        ds_id = '2000-01-01_00h00m'
        upload_dt = datetime.now()
        # Context manager closes the config file instead of leaking the handle.
        with open(ds_config_path) as ds_config_file:
            ds_config_str = ds_config_file.read()

        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs',
        }])

        ds = Dataset.load(db, ds_id)
        queue_pub.publish({'ds_id': ds.id, 'ds_name': ds.name, 'action': 'annotate'})

        run_daemons(db, es)

        # dataset and job tables asserts
        row = db.select_one('SELECT status from dataset')
        assert row[0] == 'FAILED'
        row = db.select_one('SELECT status from job')
        assert row[0] == 'FAILED'
    finally:
        db.close()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
def reindex_all_results(conf):
    """Drop and recreate the 'sm' index, then index every dataset found in the DB.

    For each dataset row the dataset name and the database name taken from its
    config are passed to ``ESExporter.index_ds``.
    """
    db = DB(conf['db'])
    es_exp = ESExporter(conf)

    # Start from an empty index.
    for rebuild_step in (es_exp.delete_index, es_exp.create_index):
        rebuild_step(name='sm')

    pairs_query = "select name, config -> 'database'::text -> 'name'::text from dataset"
    for dataset_name, moldb_name in db.select(pairs_query):
        es_exp.index_ds(db, dataset_name, moldb_name)
def _create_dataset_manager(db):
    """Build an SMapiDatasetManager around `db`, an ES exporter and four queue publishers."""
    es_exporter = ESExporter(db, sm_config)
    annotate_publisher = _create_queue_publisher(SM_ANNOTATE)
    update_publisher = _create_queue_publisher(SM_UPDATE)
    lithops_publisher = _create_queue_publisher(SM_LITHOPS)
    status_publisher = _create_queue_publisher(SM_DS_STATUS)

    return SMapiDatasetManager(
        db=db,
        es=es_exporter,
        annot_queue=annotate_publisher,
        update_queue=update_publisher,
        lit_queue=lithops_publisher,
        status_queue=status_publisher,
        logger=logger,
    )
def del_jobs(ds: Dataset, moldb_ids: Optional[Iterable[int]] = None):
    """
    Delete a dataset's jobs for the specified moldbs, or all jobs if moldb_ids is None.
    Also cleans up the annotations from ElasticSearch and deletes their ion images.

    Args:
        ds: dataset whose jobs are deleted.
        moldb_ids: ids of the molecular databases to delete jobs for. Any
            iterable is accepted, including one-shot iterators. When None,
            the ids returned by ``get_ds_moldb_ids(ds.id)`` are used.
    """
    db = DB()
    es = ESExporter(db)

    if moldb_ids is None:
        moldb_ids = get_ds_moldb_ids(ds.id)
    # Materialise once: the ids are needed twice below, and a generator or
    # iterator would be exhausted by the first use, leaving the job query
    # with an empty id list.
    moldb_ids = list(moldb_ids)
    moldbs = molecular_db.find_by_ids(moldb_ids)

    # Reuse the existing DB handle rather than constructing a second one.
    job_ids = db.select_onecol(
        'SELECT j.id FROM job j WHERE ds_id = %s AND moldb_id = ANY(%s)',
        (ds.id, moldb_ids),
    )
    del_diagnostics(ds.id, job_ids)

    for moldb in moldbs:
        logger.info(
            f'Deleting isotopic images: ds_id={ds.id} ds_name={ds.name} moldb={moldb}'
        )
        img_id_rows = db.select_onecol(
            'SELECT iso_image_ids '
            'FROM annotation m '
            'JOIN job j ON j.id = m.job_id '
            'JOIN dataset d ON d.id = j.ds_id '
            'WHERE ds_id = %s AND j.moldb_id = %s',
            (ds.id, moldb.id),
        )
        # Flatten the per-annotation id lists, dropping empty (None) slots.
        image_ids = [
            img_id for img_ids in img_id_rows for img_id in img_ids if img_id is not None
        ]
        image_storage.delete_images(image_storage.ISO, ds.id, image_ids)

        logger.info(
            f"Deleting job results: ds_id={ds.id} ds_name={ds.name} moldb={moldb}"
        )
        db.alter('DELETE FROM job WHERE ds_id = %s and moldb_id = %s', (ds.id, moldb.id))
        es.delete_ds(ds.id, moldb)
def get_manager():
    """Create a DatasetManager with a fresh DB handle, ES exporter and status queue publisher."""
    database = DB()
    status_publisher = QueuePublisher(
        config=sm_config['rabbitmq'],
        qdesc=SM_DS_STATUS,
        logger=logger,
    )
    exporter = ESExporter(database, sm_config)

    manager_kwargs = {
        'db': database,
        'es': exporter,
        'status_queue': status_publisher,
        'logger': logger,
    }
    return DatasetManager(**manager_kwargs)
def _create_dataset_manager(db):
    """Build an SMapiDatasetManager whose image store is set to the 'fs' storage type."""
    img_service_url = SMConfig.get_conf()['services']['img_service_url']
    image_store = ImageStoreServiceWrapper(img_service_url)
    image_store.storage_type = 'fs'

    es_exporter = ESExporter(db)
    annotate_publisher = _create_queue_publisher(SM_ANNOTATE)
    update_publisher = _create_queue_publisher(SM_UPDATE)
    status_publisher = _create_queue_publisher(SM_DS_STATUS)

    return SMapiDatasetManager(
        db=db,
        es=es_exporter,
        image_store=image_store,
        annot_queue=annotate_publisher,
        update_queue=update_publisher,
        status_queue=status_publisher,
        logger=logger,
    )
def __init__(
    self,
    ds: Dataset,
    perf: Profiler,
    sm_config: Optional[Dict] = None,
):
    """Store the dataset and profiler and create the DB and ES exporter handles.

    Args:
        ds: dataset this object works on.
        perf: profiler instance kept for later use.
        sm_config: configuration dict; when falsy, ``SMConfig.get_conf()``
            is used instead.
    """
    # Inputs supplied by the caller.
    self._ds = ds
    self._perf = perf
    self._sm_config = sm_config if sm_config else SMConfig.get_conf()

    # Service handles; the exporter is built on top of the DB handle.
    self._db = DB()
    self._es = ESExporter(self._db, self._sm_config)

    # Not set yet at construction time.
    self._sc = None
    self._ds_data_path = None
def run_search(self, mock_img_store=False):
    """Annotate the dataset described by this object's id, name and input path.

    Args:
        mock_img_store: when truthy, the image store comes from
            ``self._create_img_store_mock()``; otherwise a real
            ImageStoreServiceWrapper is built from the configured service URL.
    """
    img_store = (
        self._create_img_store_mock()
        if mock_img_store
        else ImageStoreServiceWrapper(self.sm_config['services']['img_service_url'])
    )
    es_exporter = ESExporter(self.db)
    daemon_manager = SMDaemonManager(db=self.db, es=es_exporter, img_store=img_store)

    dataset = create_ds_from_files(self.ds_id, self.ds_name, self.input_path)

    # Imported here, as in the original code, rather than at module level.
    from sm.engine.search_job import SearchJob

    daemon_manager.annotate(dataset, search_job_factory=SearchJob, del_first=True)
def run_search(sm_config):
    """Create a dataset from the files named in the module-level `args` and annotate it.

    The config and metadata paths default to ``config.json`` and ``meta.json``
    inside ``args.input_path``. ``args.use_lithops`` selects between
    ``annotate_lithops`` and ``annotate``.
    """
    db = DB()
    manager = DatasetManager(db, ESExporter(db, sm_config))

    config_path = args.config_path
    if not config_path:
        config_path = Path(args.input_path) / 'config.json'
    meta_path = args.meta_path
    if not meta_path:
        meta_path = Path(args.input_path) / 'meta.json'

    ds = create_ds_from_files(
        args.ds_id, args.ds_name, args.input_path, config_path, meta_path
    )

    annotate = manager.annotate_lithops if args.use_lithops else manager.annotate
    annotate(ds, del_first=True)
def reindex_all_results(conf):
    """Rebuild the 'sm' index from scratch using every dataset row in the DB."""
    index_name = 'sm'

    db = DB(conf['db'])
    es_exp = ESExporter(conf)
    es_exp.delete_index(name=index_name)
    es_exp.create_index(name=index_name)

    # Each row is (dataset name, database name taken from the dataset config).
    rows = db.select("select name, config -> 'database'::text -> 'name'::text from dataset")
    for dataset_name, moldb_name in rows:
        es_exp.index_ds(db, dataset_name, moldb_name)
def test_sm_daemons_annot_fails(
    MSMSearchMock,
    post_images_to_image_store_mock,
    test_db,
    es_dsl_search,
    clean_isotope_storage,
    reset_queues,
    metadata,
    ds_config,
    queue_pub,
    local_sm_config,
):
    """When the mocked MSM search raises, the dataset must end up in status 'FAILED'."""
    moldb = init_moldb()

    def throw_exception_function(*args, **kwargs):
        raise Exception('Test exception')

    MSMSearchMock().search.side_effect = throw_exception_function

    image_ids = ['iso_image_1', None, None, None]
    post_images_to_image_store_mock.return_value = {
        key: image_ids for key in (0, 1, 2)
    }

    db = DB()
    es = ESExporter(db, local_sm_config)

    test_config = dict(ds_config)
    test_config['database_ids'] = [moldb.id]
    ds = create_test_ds(
        name=test_ds_name,
        input_path=input_dir_path,
        config=test_config,
        status=DatasetStatus.QUEUED,
        es=es,
    )

    annotate_message = {
        'ds_id': ds.id,
        'ds_name': test_ds_name,
        'action': DaemonAction.ANNOTATE,
    }
    queue_pub.publish(annotate_message)

    run_daemons(db, es, local_sm_config)

    # dataset and job tables asserts
    status_row = db.select_one('SELECT status from dataset')
    assert len(status_row) == 1
    assert status_row[0] == 'FAILED'
def __init__(
    self,
    executor: Executor,
    ds: Dataset,
    perf: Profiler,
    sm_config: Optional[Dict] = None,
    use_cache=False,
    store_images=True,
):
    """Prepare storage, DB/ES handles, uploaded inputs and the processing pipeline.

    Args:
        executor: passed through to ``Pipeline``.
        ds: dataset to process; its ``input_path`` and ``config`` are read here.
        perf: profiler instance kept on the object.
        sm_config: configuration dict; falls back to ``SMConfig.get_conf()``
            when falsy.
        use_cache: development aid - results are cached after each pipeline
            step so individual steps can be re-run quickly. The cache key is
            a hash of the dataset's input path and config.
        store_images: flag kept on the object as ``self.store_images``.
    """
    if not sm_config:
        sm_config = SMConfig.get_conf()
    lithops_config = sm_config['lithops']

    self.sm_storage = lithops_config['sm_storage']
    self.storage = Storage(lithops_config)
    self.s3_client = get_s3_client()
    self.ds = ds
    self.perf = perf
    self.store_images = store_images
    self.db = DB()
    self.es = ESExporter(self.db, sm_config)

    # Make sure the imzML/ibd pair and the molecular databases are uploaded.
    uploaded_imzml = _upload_imzmls_from_prefix_if_needed(
        self.ds.input_path, self.storage, self.sm_storage, self.s3_client
    )
    self.imzml_cobj, self.ibd_cobj = uploaded_imzml
    self.moldb_defs = _upload_moldbs_from_db(
        self.ds.config['database_ids'], self.storage, self.sm_storage
    )

    cache_key: Optional[str] = (
        jsonhash({'input_path': ds.input_path, 'ds': ds.config}) if use_cache else None
    )

    self.pipe = Pipeline(
        self.imzml_cobj,
        self.ibd_cobj,
        self.moldb_defs,
        self.ds.config,
        cache_key=cache_key,
        executor=executor,
    )

    # Filled in later; nothing has been computed at construction time.
    self.results_dfs = None
    self.png_cobjs = None
    self.db_formula_image_ids = None
def test_add_isomer_fields_to_anns():
    """Annotations sharing an ion formula must list each other as isomers.

    Three documents share the ion formula 'H2O' and one has 'H1O'. The
    expected 'isomer_ions' and 'comps_count_with_isomers' values are listed
    per document below.
    """
    # (ion, ion_formula, comp_ids, expected isomer_ions, expected comps count)
    cases = [
        ('H2O+H-H-', 'H2O', ['1'], ['H3O-H-', 'H3O+CO2-CO2-H-'], 4),
        ('H3O-H-', 'H2O', ['2', '3'], ['H2O+H-H-', 'H3O+CO2-CO2-H-'], 4),
        ('H3O+CO2-CO2-H-', 'H2O', ['2', '3', '4'], ['H2O+H-H-', 'H3O-H-'], 4),
        ('H2O-H-', 'H1O', ['4'], [], 1),
    ]
    ann_docs = [
        {'ion': ion, 'ion_formula': ion_formula, 'comp_ids': comp_ids}
        for ion, ion_formula, comp_ids, _, _ in cases
    ]

    ESExporter._add_isomer_fields_to_anns(ann_docs)

    actual_isomer_ions = [doc['isomer_ions'] for doc in ann_docs]
    actual_comps_counts = [doc['comps_count_with_isomers'] for doc in ann_docs]

    assert actual_isomer_ions == [expected_ions for _, _, _, expected_ions, _ in cases]
    assert actual_comps_counts == [expected_count for _, _, _, _, expected_count in cases]
def set_metadata_thumbnail(db, config, ds_name):
    """Add an optical-image thumbnail to every selected dataset that has a transform.

    Args:
        db: database handle used for all queries.
        config: configuration dict; ``config['services']['img_service_url']``
            is read for the image store.
        ds_name: dataset name to process, or ``ALL_DS_MASK`` to process
            every dataset.
    """
    # The WHERE clause and the query parameters must be decided by the same
    # condition; otherwise a parameter is passed to a query without a
    # placeholder (or the other way round).
    filter_by_name = ds_name != ALL_DS_MASK
    ds_thumb_query = 'SELECT id, transform, thumbnail from dataset {}'.format(
        'WHERE name = %s' if filter_by_name else '')
    query_params = (ds_name,) if filter_by_name else None

    for ds_id, transform, _thumbnail in db.select(ds_thumb_query, params=query_params):
        if transform is None:
            # No transform is stored for this dataset, so it is skipped.
            continue

        ds = api.Dataset.load(db=db, ds_id=ds_id)
        img_store = ImageStoreServiceWrapper(config['services']['img_service_url'])
        img_store.storage_type = 'fs'
        sm = SMapiDatasetManager(db=db, es=ESExporter(db), image_store=img_store, mode='queue')

        img_id = db.select('SELECT optical_image from dataset WHERE id = %s', params=(ds.id,))
        sm._add_thumbnail_optical_image(ds, f"{img_id[0][0]}", transform)
def _reindex_all(conf):
    """Reindex every dataset into a newly created index, then point the alias at it.

    If anything fails after the new index was created, that index is deleted
    and the exception is re-raised.
    """
    es_config = conf['elasticsearch']
    alias = es_config['index']

    es_man = ESIndexManager(es_config)
    current_index = es_man.internal_index_name(alias)
    new_index = es_man.another_index_name(current_index)
    es_man.create_index(new_index)

    try:
        # Work on a copy so the caller's ES config keeps pointing at the alias.
        tmp_es_config = deepcopy(es_config)
        tmp_es_config['index'] = new_index

        db = DB(conf['db'])
        es_exp = ESExporter(db, tmp_es_config)
        dataset_rows = db.select('select id, name, config from dataset')
        _reindex_datasets(dataset_rows, es_exp)

        es_man.remap_alias(tmp_es_config['index'], alias=alias)
    except Exception as e:
        es_man.delete_index(new_index)
        raise e
def reindex_results(ds_id, ds_mask):
    """Reindex datasets selected by id or by name prefix.

    Args:
        ds_id: id of a single dataset to reindex, or a falsy value.
        ds_mask: dataset name prefix, used when ``ds_id`` is falsy. The
            special value '_all_' triggers ``_reindex_all`` instead.

    At least one of the two arguments must be truthy.
    """
    assert ds_id or ds_mask

    conf = SMConfig.get_conf()
    if ds_mask == '_all_':
        _reindex_all(conf)
    else:
        db = DB(conf['db'])
        es_exp = ESExporter(db)

        # Values are bound as query parameters rather than formatted into the
        # SQL text, so quotes in an id or mask cannot break or alter the query.
        if ds_id:
            rows = db.select(
                'select id, name, config from dataset where id = %s',
                params=(str(ds_id),),
            )
        elif ds_mask:
            rows = db.select(
                'select id, name, config from dataset where name like %s',
                params=('{}%'.format(ds_mask),),
            )
        else:
            rows = []

        _reindex_datasets(rows, es_exp)
def _reindex_all(sm_config):
    """Reindex all datasets into a new index and switch the alias over to it.

    On success the previously used index is deleted. On failure the newly
    created index is deleted and the exception is re-raised.
    """
    es_config = sm_config['elasticsearch']
    alias = es_config['index']

    es_man = ESIndexManager(es_config)
    old_index = es_man.internal_index_name(alias)
    new_index = es_man.another_index_name(old_index)
    es_man.create_index(new_index)

    try:
        inactive_es_config = get_inactive_index_es_config(es_config)
        exporter_config = dict(sm_config, elasticsearch=inactive_es_config)

        db = DB()
        es_exp = ESExporter(db, exporter_config)

        id_rows = db.select('select id from dataset')
        all_ds_ids = [id_row[0] for id_row in id_rows]
        _reindex_datasets(all_ds_ids, es_exp)

        es_man.remap_alias(inactive_es_config['index'], alias=alias)
    except Exception as e:
        es_man.delete_index(new_index)
        raise e
    else:
        es_man.delete_index(old_index)
def test_update_ds_works_for_all_fields(sm_config, test_db, es, sm_index, es_dsl_search):
    """``update_ds`` must write every updated field to both dataset and annotation docs."""
    update = {
        'name': 'new_ds_name',
        'submitter_id': 'new_ds_submitter_id',
        'group_id': 'new_ds_group_id',
        'projects_ids': ['proj_id1', 'proj_id2'],
        'is_public': True,
    }
    index = sm_config['elasticsearch']['index']

    annotation_body = {
        'ds_id': 'dataset1',
        'ds_name': 'ds_name',
        'ds_submitter_id': 'ds_submitter',
        'ds_group_id': 'ds_group_id',
        'ds_project_ids': [],
        'ds_is_public': False,
    }
    dataset_body = {
        'ds_id': 'dataset1',
        'ds_name': 'ds_name',
        'ds_submitter_id': 'ds_submitter_id',
        'ds_group_id': 'ds_group_id',
        'ds_projects_ids': [],
        'ds_is_public': False,
    }
    es.create(index=index, doc_type='annotation', id='id1', body=annotation_body)
    es.create(index=index, doc_type='dataset', id='dataset1', body=dataset_body)
    wait_for_es(es, index)

    # Row returned by the mocked DB when the exporter asks for dataset fields.
    updated_row = {
        'ds_name': 'new_ds_name',
        'ds_submitter_id': 'new_ds_submitter_id',
        'ds_submitter_name': 'submitter_name',
        'ds_submitter_email': 'submitter_email',
        'ds_group_id': 'new_ds_group_id',
        'ds_group_name': 'group_name',
        'ds_group_approved': True,
        'ds_group_short_name': 'group_short_name',
        'ds_projects_ids': ['proj_id1', 'proj_id2'],
        'ds_is_public': True,
    }
    db_mock = MagicMock(spec=DB)
    db_mock.select_with_fields.return_value = [updated_row]

    es_exporter = ESExporter(db_mock, sm_config)
    es_exporter.update_ds('dataset1', fields=list(update.keys()))
    wait_for_es(es, index)

    for doc_type in ('dataset', 'annotation'):
        search_result = es_dsl_search.filter('term', _type=doc_type).execute().to_dict()
        stored_doc = search_result['hits']['hits'][0]['_source']
        for k, v in update.items():
            assert v == stored_doc[f'ds_{k}']
def test_index_ds_works(es_dsl_search, sm_index, sm_config):
    """Index one dataset with two mocked annotations and check the stored documents.

    The DB, molecular database and isotope calculator are mocks. The test
    verifies the dataset document and both annotation documents field by field.
    """
    ds_id = '2000-01-01_00h00m'
    upload_dt = datetime.now().isoformat(' ')
    mol_db_id = 0
    last_finished = '2017-01-01T00:00:00'

    def db_sel_side_effect(sql, params):
        """Return canned rows for the dataset and annotation queries."""
        if sql == DATASET_SEL:
            return [{
                'ds_id': ds_id,
                'ds_name': 'ds_name',
                'ds_input_path': 'ds_input_path',
                'ds_config': 'ds_config',
                'ds_meta': {},
                'ds_upload_dt': upload_dt,
                'ds_status': 'ds_status',
                'ds_last_finished': datetime.strptime(last_finished, '%Y-%m-%dT%H:%M:%S'),
                'ds_is_public': True,
                'ds_ion_img_storage': 'fs',
                'ds_acq_geometry': {}
            }]
        elif sql == ANNOTATIONS_SEL:
            return [{
                'sf': 'H2O',
                'sf_adduct': 'H2O+H',
                'chaos': 1,
                'image_corr': 1,
                'pattern_match': 1,
                'total_iso_ints': 100,
                'min_iso_ints': 0,
                'max_iso_ints': 100,
                'msm': 1,
                'adduct': '+H',
                'job_id': 1,
                'fdr': 0.1,
                'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'],
                'polarity': '+'
            }, {
                'sf': 'Au',
                'sf_adduct': 'Au+H',
                'chaos': 1,
                'image_corr': 1,
                'pattern_match': 1,
                'total_iso_ints': 100,
                'min_iso_ints': 0,
                'max_iso_ints': 100,
                'msm': 1,
                'adduct': '+H',
                'job_id': 1,
                'fdr': 0.05,
                'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'],
                'polarity': '+'
            }]
        else:
            # Log the arguments this function actually received; the message
            # needs placeholders for logging to render them.
            logging.getLogger('engine').error(
                'Wrong db_sel_side_effect arguments: %s, %s', sql, params)

    db_mock = MagicMock(spec=DB)
    db_mock.select_with_fields.side_effect = db_sel_side_effect

    mol_db_mock = MagicMock(MolecularDB)
    mol_db_mock.id = mol_db_id
    mol_db_mock.name = 'db_name'
    mol_db_mock.version = '2017'
    mol_db_mock.get_molecules.return_value = pd.DataFrame(
        [('H2O', 'mol_id', 'mol_name'), ('Au', 'mol_id', 'mol_name')],
        columns=['sf', 'mol_id', 'mol_name'])

    isocalc_mock = MagicMock(IsocalcWrapper)
    isocalc_mock.ion_centroids = lambda sf, adduct: {
        ('H2O', '+H'): ([100., 200.], None),
        ('Au', '+H'): ([10., 20.], None)
    }[(sf, adduct)]

    es_exp = ESExporter(db_mock)
    # Remove any documents left for this dataset before indexing it.
    es_exp.delete_ds(ds_id)
    es_exp.index_ds(ds_id=ds_id, mol_db=mol_db_mock, isocalc=isocalc_mock)

    wait_for_es(sec=1)

    ds_d = es_dsl_search.filter('term', _type='dataset').execute().to_dict()['hits']['hits'][0]['_source']
    assert ds_d == {
        'ds_last_finished': last_finished, 'ds_config': 'ds_config', 'ds_meta': {},
        'ds_status': 'ds_status', 'ds_name': 'ds_name', 'ds_input_path': 'ds_input_path',
        'ds_id': ds_id, 'ds_upload_dt': upload_dt,
        'annotation_counts': [{'db': {'name': 'db_name', 'version': '2017'},
                               'counts': [{'level': 5, 'n': 1}, {'level': 10, 'n': 2},
                                          {'level': 20, 'n': 2}, {'level': 50, 'n': 2}]}],
        'ds_is_public': True,
        'ds_acq_geometry': {},
        'ds_ion_img_storage': 'fs'
    }

    ann_1_d = es_dsl_search.filter('term', sf='H2O').execute().to_dict()['hits']['hits'][0]['_source']
    assert ann_1_d == {
        'pattern_match': 1, 'image_corr': 1, 'fdr': 0.1, 'chaos': 1, 'sf': 'H2O', 'min_iso_ints': 0,
        'msm': 1, 'sf_adduct': 'H2O+H', 'total_iso_ints': 100, 'centroid_mzs': [100., 200.],
        'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'], 'polarity': '+', 'job_id': 1, 'max_iso_ints': 100,
        'adduct': '+H', 'ds_name': 'ds_name', 'annotation_counts': [], 'db_version': '2017', 'ds_status': 'ds_status',
        'ion_add_pol': '[M+H]+', 'comp_names': ['mol_name'], 'db_name': 'db_name', 'mz': 100., 'ds_meta': {},
        'comp_ids': ['mol_id'], 'ds_config': 'ds_config', 'ds_input_path': 'ds_input_path', 'ds_id': ds_id,
        'ds_upload_dt': upload_dt, 'ds_last_finished': last_finished,
        'ds_ion_img_storage': 'fs', 'ds_is_public': True
    }

    ann_2_d = es_dsl_search.filter('term', sf='Au').execute().to_dict()['hits']['hits'][0]['_source']
    assert ann_2_d == {
        'pattern_match': 1, 'image_corr': 1, 'fdr': 0.05, 'chaos': 1, 'sf': 'Au', 'min_iso_ints': 0,
        'msm': 1, 'sf_adduct': 'Au+H', 'total_iso_ints': 100, 'centroid_mzs': [10., 20.],
        'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'], 'polarity': '+', 'job_id': 1, 'max_iso_ints': 100,
        'adduct': '+H', 'ds_name': 'ds_name', 'annotation_counts': [], 'db_version': '2017', 'ds_status': 'ds_status',
        'ion_add_pol': '[M+H]+', 'comp_names': ['mol_name'], 'db_name': 'db_name', 'mz': 10., 'ds_meta': {},
        'comp_ids': ['mol_id'], 'ds_config': 'ds_config', 'ds_input_path': 'ds_input_path', 'ds_id': ds_id,
        'ds_upload_dt': upload_dt, 'ds_last_finished': last_finished,
        'ds_ion_img_storage': 'fs', 'ds_is_public': True
    }
def run(self, ds):
    """Entry point of the engine: run the annotation pipeline for one dataset.

    The molecule search goes through these stages:
        * copy the input data into the engine work directory
        * convert the input mass spec files to plain text, one spectrum per line
        * generate theoretical peaks for all formulas of the molecular
          database and save them to the database
        * search for molecules - the most compute-intensive stage, run in a
          distributed manner with Spark
        * save the results (isotope images and their quality metrics for each
          putative molecule) to the database

    Spark, the DB connection and the work directory are released in a
    ``finally`` clause, so cleanup happens on failure as well.

    Args
    ----
    ds : sm.engine.dataset_manager.Dataset
    """
    try:
        logger.info('*' * 150)
        start = time.time()

        self._init_db()
        self._es = ESExporter(self._db)
        self._ds = ds

        rabbitmq_config = self._sm_config['rabbitmq']
        self._status_queue = (
            QueuePublisher(config=rabbitmq_config, qdesc=SM_DS_STATUS, logger=logger)
            if rabbitmq_config
            else None
        )

        self._wd_manager = WorkDirManager(ds.id)
        self._configure_spark()

        if not self.no_clean:
            self._wd_manager.clean()

        self._ds_reader = DatasetReader(self._ds.input_path, self._sc, self._wd_manager)
        self._ds_reader.copy_convert_input_data()

        self._save_data_from_raw_ms_file()
        self._img_store.storage_type = self._ds.get_ion_img_storage_type(self._db)

        logger.info('Dataset config:\n%s', pformat(self._ds.config))

        completed_moldb_ids, new_moldb_ids = self._moldb_ids()
        # ignore ids present in both sets
        changed_moldb_ids = completed_moldb_ids.symmetric_difference(new_moldb_ids)
        for moldb_id in changed_moldb_ids:
            mol_db = MolecularDB(
                id=moldb_id,
                db=self._db,
                iso_gen_config=self._ds.config['isotope_generation'],
            )
            if moldb_id in new_moldb_ids:
                # Newly requested database without a completed job.
                if moldb_id not in completed_moldb_ids:
                    self._run_annotation_job(mol_db)
            else:
                # Completed earlier but no longer requested.
                self._remove_annotation_job(mol_db)

        logger.info("All done!")
        time_spent = time.time() - start
        minutes, seconds = divmod(int(round(time_spent)), 60)
        logger.info('Time spent: %d mins %d secs', minutes, seconds)
    finally:
        if self._sc:
            self._sc.stop()
        if self._db:
            self._db.close()
        if self._wd_manager and not self.no_clean:
            self._wd_manager.clean()
        logger.info('*' * 150)
type=str, help='SM config path') args = parser.parse_args() SMConfig.set_path(args.config_path) sm_config = SMConfig.get_conf() init_loggers(sm_config['logs']) logger = logging.getLogger(f'{args.name}-daemon') logger.info(f'Starting {args.name}-daemon') db = DB(sm_config['db']) status_queue_pub = QueuePublisher(config=sm_config['rabbitmq'], qdesc=SM_DS_STATUS, logger=logger) manager = SMDaemonManager(db=db, es=ESExporter(db), img_store=ImageStoreServiceWrapper( sm_config['services']['img_service_url']), status_queue=status_queue_pub, logger=logger) if args.name == 'annotate': daemon = SMAnnotateDaemon(manager=manager, annot_qdesc=SM_ANNOTATE, upd_qdesc=SM_UPDATE) elif args.name == 'update': daemon = SMUpdateDaemon(manager=manager, update_qdesc=SM_UPDATE) else: raise Exception(f'Wrong SM daemon name: {args.name}') signal.signal(signal.SIGINT, lambda *args: daemon.stop()) signal.signal(signal.SIGTERM, lambda *args: daemon.stop())
def test_index_ds_works(es_dsl_search, sm_index, sm_config):
    """Index a mocked dataset with two annotations and verify all three ES documents."""
    ds_id = '2000-01-01_00h00m'
    upload_dt = datetime.now().isoformat(' ')
    mol_db_id = 0
    last_finished = '2017-01-01T00:00:00'

    def db_sel_side_effect(sql, params):
        """Serve canned rows for the two queries the exporter is expected to run."""
        if sql == DATASET_SEL:
            return [{
                'ds_id': ds_id,
                'ds_name': 'ds_name',
                'ds_input_path': 'ds_input_path',
                'ds_config': 'ds_config',
                'ds_meta': {},
                'ds_upload_dt': upload_dt,
                'ds_status': 'ds_status',
                'ds_last_finished': datetime.strptime(last_finished, '%Y-%m-%dT%H:%M:%S'),
                'ds_is_public': True,
                'ds_ion_img_storage': 'fs',
                'ds_acq_geometry': {}
            }]
        elif sql == ANNOTATIONS_SEL:
            return [{
                'sf': 'H2O',
                'sf_adduct': 'H2O+H',
                'chaos': 1,
                'image_corr': 1,
                'pattern_match': 1,
                'total_iso_ints': 100,
                'min_iso_ints': 0,
                'max_iso_ints': 100,
                'msm': 1,
                'adduct': '+H',
                'job_id': 1,
                'fdr': 0.1,
                'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'],
                'polarity': '+'
            }, {
                'sf': 'Au',
                'sf_adduct': 'Au+H',
                'chaos': 1,
                'image_corr': 1,
                'pattern_match': 1,
                'total_iso_ints': 100,
                'min_iso_ints': 0,
                'max_iso_ints': 100,
                'msm': 1,
                'adduct': '+H',
                'job_id': 1,
                'fdr': 0.05,
                'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'],
                'polarity': '+'
            }]
        else:
            # The previous code logged an undefined name `args` and had no
            # placeholder in the message; log the real arguments instead.
            logging.getLogger('engine').error(
                'Wrong db_sel_side_effect arguments: %s, %s', sql, params)

    db_mock = MagicMock(spec=DB)
    db_mock.select_with_fields.side_effect = db_sel_side_effect

    mol_db_mock = MagicMock(MolecularDB)
    mol_db_mock.id = mol_db_id
    mol_db_mock.name = 'db_name'
    mol_db_mock.version = '2017'
    mol_db_mock.get_molecules.return_value = pd.DataFrame(
        [('H2O', 'mol_id', 'mol_name'), ('Au', 'mol_id', 'mol_name')],
        columns=['sf', 'mol_id', 'mol_name'])

    isocalc_mock = MagicMock(IsocalcWrapper)
    isocalc_mock.ion_centroids = lambda sf, adduct: {
        ('H2O', '+H'): ([100., 200.], None),
        ('Au', '+H'): ([10., 20.], None)
    }[(sf, adduct)]

    es_exp = ESExporter(db_mock)
    # Clear any earlier documents for this dataset, then index it.
    es_exp.delete_ds(ds_id)
    es_exp.index_ds(ds_id=ds_id, mol_db=mol_db_mock, isocalc=isocalc_mock)

    wait_for_es(sec=1)

    ds_d = es_dsl_search.filter('term', _type='dataset').execute().to_dict()['hits']['hits'][0]['_source']
    assert ds_d == {
        'ds_last_finished': last_finished, 'ds_config': 'ds_config', 'ds_meta': {},
        'ds_status': 'ds_status', 'ds_name': 'ds_name', 'ds_input_path': 'ds_input_path',
        'ds_id': ds_id, 'ds_upload_dt': upload_dt,
        'annotation_counts': [{'db': {'name': 'db_name', 'version': '2017'},
                               'counts': [{'level': 5, 'n': 1}, {'level': 10, 'n': 2},
                                          {'level': 20, 'n': 2}, {'level': 50, 'n': 2}]}],
        'ds_is_public': True,
        'ds_acq_geometry': {},
        'ds_ion_img_storage': 'fs'
    }

    ann_1_d = es_dsl_search.filter('term', sf='H2O').execute().to_dict()['hits']['hits'][0]['_source']
    assert ann_1_d == {
        'pattern_match': 1, 'image_corr': 1, 'fdr': 0.1, 'chaos': 1, 'sf': 'H2O', 'min_iso_ints': 0,
        'msm': 1, 'sf_adduct': 'H2O+H', 'total_iso_ints': 100, 'centroid_mzs': [100., 200.],
        'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'], 'polarity': '+', 'job_id': 1, 'max_iso_ints': 100,
        'adduct': '+H', 'ds_name': 'ds_name', 'annotation_counts': [], 'db_version': '2017', 'ds_status': 'ds_status',
        'ion_add_pol': '[M+H]+', 'comp_names': ['mol_name'], 'db_name': 'db_name', 'mz': 100., 'ds_meta': {},
        'comp_ids': ['mol_id'], 'ds_config': 'ds_config', 'ds_input_path': 'ds_input_path', 'ds_id': ds_id,
        'ds_upload_dt': upload_dt, 'ds_last_finished': last_finished,
        'ds_ion_img_storage': 'fs', 'ds_is_public': True
    }

    ann_2_d = es_dsl_search.filter('term', sf='Au').execute().to_dict()['hits']['hits'][0]['_source']
    assert ann_2_d == {
        'pattern_match': 1, 'image_corr': 1, 'fdr': 0.05, 'chaos': 1, 'sf': 'Au', 'min_iso_ints': 0,
        'msm': 1, 'sf_adduct': 'Au+H', 'total_iso_ints': 100, 'centroid_mzs': [10., 20.],
        'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'], 'polarity': '+', 'job_id': 1, 'max_iso_ints': 100,
        'adduct': '+H', 'ds_name': 'ds_name', 'annotation_counts': [], 'db_version': '2017', 'ds_status': 'ds_status',
        'ion_add_pol': '[M+H]+', 'comp_names': ['mol_name'], 'db_name': 'db_name', 'mz': 10., 'ds_meta': {},
        'comp_ids': ['mol_id'], 'ds_config': 'ds_config', 'ds_input_path': 'ds_input_path', 'ds_id': ds_id,
        'ds_upload_dt': upload_dt, 'ds_last_finished': last_finished,
        'ds_ion_img_storage': 'fs', 'ds_is_public': True
    }
help='Path to a dataset location') parser.add_argument( '--no-clean', dest='no_clean', action='store_true', help="Don't clean dataset txt files after job is finished") parser.add_argument('--config', dest='sm_config_path', default='conf/config.json', type=str, help='SM config path') args = parser.parse_args() SMConfig.set_path(args.sm_config_path) sm_config = SMConfig.get_conf() init_loggers(sm_config['logs']) db = DB(sm_config['db']) img_store = ImageStoreServiceWrapper( sm_config['services']['img_service_url']) manager = SMDaemonManager(db, ESExporter(db), img_store) try: ds = create_ds_from_files(args.ds_id, args.ds_name, args.input_path) manager.annotate(ds, SearchJob, del_first=True) except Exception as e: logging.getLogger('engine').error(e, exc_info=True) sys.exit(1) sys.exit()
def test_index_ds_works(
    sm_config,
    test_db,
    es,
    es_dsl_search,
    sm_index,
    ds_config,
    metadata,
    annotation_stats,
):
    """Index a dataset with two annotations and check the documents stored in ES.

    Inserts one dataset, one job, a user, a group, two ions and two
    annotations (H2O and Au) into the database, runs ``ESExporter.index_ds``
    with ``fetch_molecules`` patched, and compares the dataset document and
    both annotation documents with the expected field values.
    """
    ds_id = '2000-01-01_00h00m'
    upload_dt = datetime.now().isoformat()
    last_finished = '2017-01-01 00:00:00'
    iso_image_ids = ['iso_img_id_1', 'iso_img_id_2']
    stats_json = json.dumps(annotation_stats)

    # --- database fixtures -------------------------------------------------
    db = DB()
    dataset_row = [
        ds_id,
        json.dumps(ds_config),
        json.dumps(metadata),
        upload_dt,
        upload_dt,
        'thumb-id',
    ]
    db.insert(
        "INSERT INTO dataset(id, name, input_path, config, metadata, upload_dt, status, "
        "status_update_dt, is_public, acq_geometry, ion_thumbnail) "
        "VALUES (%s, 'ds_name', 'ds_input_path', %s, %s, %s, 'ds_status', %s, true, '{}', %s)",
        [dataset_row],
    )
    moldb = create_test_molecular_db()
    (job_id,) = db.insert_return(
        "INSERT INTO job(ds_id, moldb_id, status, start, finish) "
        "VALUES (%s, %s, 'job_status', %s, %s) RETURNING id",
        rows=[(ds_id, moldb.id, last_finished, last_finished)],
    )
    (user_id,) = db.insert_return(
        "INSERT INTO graphql.user (email, name, role) "
        "VALUES ('email', 'user_name', 'user') RETURNING id",
        [[]],
    )
    (group_id,) = db.insert_return(
        "INSERT INTO graphql.group (name, short_name) VALUES ('group name', 'grp') RETURNING id",
        [[]],
    )
    db.insert(
        "INSERT INTO graphql.dataset(id, user_id, group_id) VALUES (%s, %s, %s)",
        [[ds_id, user_id, group_id]],
    )
    ion_rows = [
        ['H2O-H+O-H+H', 'H2O', '-H+O', '-H', '+H', 1, 'HO2'],
        ['Au+H', 'Au', '', '', '+H', 1, 'HAu'],
    ]
    ion_id1, ion_id2 = db.insert_return(
        "INSERT INTO graphql.ion(ion, formula, chem_mod, neutral_loss, adduct, charge, ion_formula) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s) RETURNING id",
        ion_rows,
    )
    annotation_rows = [
        [job_id, 'H2O', '-H+O', '-H', '+H', 1, 0.1, stats_json, iso_image_ids, ion_id1],
        [job_id, 'Au', '', '', '+H', 1, 0.05, stats_json, iso_image_ids, ion_id2],
    ]
    db.insert(
        "INSERT INTO annotation(job_id, formula, chem_mod, neutral_loss, adduct, "
        "msm, fdr, stats, iso_image_ids, ion_id) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
        annotation_rows,
    )

    # --- isotope calculator stub --------------------------------------------
    def fake_centroids(formula):
        # A fresh mapping is built on every call, so callers never share lists.
        return {
            'H2O+H': ([100.0, 200.0], None),
            'H2O-H+O-H+H': ([100.0, 200.0, 300.0], None),
            'Au+H': ([10.0, 20.0], None),
        }[formula]

    def fake_mass_accuracy_bounds(mzs):
        return (mzs, mzs)

    isocalc_mock = MagicMock(IsocalcWrapper)
    isocalc_mock.centroids = fake_centroids
    isocalc_mock.mass_accuracy_bounds = fake_mass_accuracy_bounds

    # --- run the export with the molecule lookup patched ---------------------
    molecules_df = pd.DataFrame(
        [('H2O', 'mol_id', 'mol_name'), ('Au', 'mol_id', 'mol_name')],
        columns=['formula', 'mol_id', 'mol_name'],
    )
    with patch(
        'sm.engine.es_export.molecular_db.fetch_molecules',
        return_value=molecules_df,
    ):
        exporter = ESExporter(db, sm_config)
        exporter.delete_ds(ds_id)
        exporter.index_ds(ds_id=ds_id, moldb=moldb, isocalc=isocalc_mock)

    wait_for_es(es, sm_config['elasticsearch']['index'])

    def first_source(**term):
        """Return the ``_source`` of the first hit matching a single term filter."""
        response = es_dsl_search.filter('term', **term).execute()
        return response.to_dict()['hits']['hits'][0]['_source']

    # --- dataset document ----------------------------------------------------
    ds_doc = first_source(_type='dataset')
    expected_ds_fields = {
        'ds_last_finished': last_finished,
        'ds_config': ds_config,
        'ds_adducts': ds_config['isotope_generation']['adducts'],
        'ds_moldb_ids': ds_config['database_ids'],
        'ds_chem_mods': [],
        'ds_neutral_losses': [],
        'ds_project_ids': [],
        'ds_project_names': [],
        'ds_meta': metadata,
        'ds_status': 'ds_status',
        'ds_status_update_dt': upload_dt,
        'ds_name': 'ds_name',
        'ds_input_path': 'ds_input_path',
        'ds_id': ds_id,
        'ds_upload_dt': upload_dt,
        'ds_is_public': True,
        'ds_submitter_email': 'email',
        'ds_submitter_id': user_id,
        'ds_submitter_name': 'user_name',
        'ds_group_approved': False,
        'ds_group_id': group_id,
        'ds_group_name': 'group name',
        'ds_group_short_name': 'grp',
    }
    expected_counts = [
        {'level': 5, 'n': 1},
        {'level': 10, 'n': 2},
        {'level': 20, 'n': 2},
        {'level': 50, 'n': 2},
    ]
    assert ds_doc == {
        **expected_ds_fields,
        'ds_acq_geometry': {},
        'annotation_counts': [
            {'db': {'id': moldb.id, 'name': moldb.name}, 'counts': expected_counts},
        ],
    }

    # --- annotation documents ------------------------------------------------
    # Stats named in NON_METRIC_STATS are expected at the top level of the
    # document; every other stat is expected under 'metrics'.
    top_level_stats = {
        'pattern_match': annotation_stats['spectral'],
        'image_corr': annotation_stats['spatial'],
        'chaos': annotation_stats['chaos'],
    }
    metrics = {}
    for stat_name, stat_value in annotation_stats.items():
        if stat_name in NON_METRIC_STATS:
            top_level_stats[stat_name] = stat_value
        else:
            metrics[stat_name] = stat_value

    def expected_annotation(**ion_fields):
        """Build the expected annotation document; ``ion_fields`` are per-ion values."""
        bucket = sm_config["image_storage"]["bucket"]
        return {
            **expected_ds_fields,
            **top_level_stats,
            'metrics': metrics,
            'msm': 1.0,
            'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'],
            'iso_image_urls': [
                f'http://localhost:9000/{bucket}/iso/{ds_id}/{image_id}'
                for image_id in ('iso_img_id_1', 'iso_img_id_2')
            ],
            'isobars': [],
            'isomer_ions': [],
            'polarity': '+',
            'job_id': 1,
            'adduct': '+H',
            'annotation_counts': [],
            'comp_names': ['mol_name'],
            'comps_count_with_isomers': 1,
            'db_id': moldb.id,
            'db_name': moldb.name,
            'db_version': moldb.version,
            'comp_ids': ['mol_id'],
            'off_sample_label': None,
            'off_sample_prob': None,
            **ion_fields,
        }

    h2o_doc = first_source(formula='H2O')
    assert h2o_doc == expected_annotation(
        fdr=0.1,
        formula='H2O',
        ion='H2O-H+O-H+H+',
        ion_formula='HO2',
        centroid_mzs=[100.0, 200.0, 300.0],
        neutral_loss='-H',
        chem_mod='-H+O',
        mz=100.0,
        annotation_id=1,
    )

    au_doc = first_source(formula='Au')
    assert au_doc == expected_annotation(
        fdr=0.05,
        formula='Au',
        ion='Au+H+',
        ion_formula='HAu',
        centroid_mzs=[10.0, 20.0],
        neutral_loss='',
        chem_mod='',
        mz=10.0,
        annotation_id=2,
    )
def test_sm_daemon_es_export_fails(
    MSMSearchMock,
    post_images_to_image_store_mock,
    test_db,
    es_dsl_search,
    clean_isotope_storage,
    reset_queues,
    metadata,
    ds_config,
    queue_pub,
    local_sm_config,
):
    """Check the recorded statuses when ES indexing raises during annotation.

    The search algorithm and image upload are mocked, and the exporter's
    ``index_ds`` is replaced with a function that always raises. After the
    daemons run, the job row must be 'FINISHED' and the dataset row 'FAILED'.
    """
    moldb = init_moldb()

    n_ions = 3
    metric_columns = {
        'formula_i': list(range(n_ions)),
        'ion_formula': ['C12H24O-H2O+H', 'C12H24O-H2+O2-CO+Na', 'C12H24O+K'],
        'formula': ['C12H24O'] * n_ions,
        'modifier': ['-H2O+H', '-H2+O2-CO+Na', '+K'],
        'chem_mod': ['', '-H2+O2', ''],
        'neutral_loss': ['-H2O', '-CO', ''],
        'adduct': ['+H', '+Na', '+K'],
        'chaos': [0.9] * n_ions,
        'spatial': [0.9] * n_ions,
        'spectral': [0.9] * n_ions,
        'msm': [0.9**3] * n_ions,
        'total_iso_ints': [[100.0] for _ in range(n_ions)],
        'min_iso_ints': [[0] for _ in range(n_ions)],
        'max_iso_ints': [[10.0] for _ in range(n_ions)],
        'fdr': [0.1] * n_ions,
    }
    formula_metrics_df = pd.DataFrame(metric_columns).set_index('formula_i')

    search_algo_mock = MSMSearchMock()
    search_algo_mock.search.return_value = [
        (formula_metrics_df, [], create_test_fdr_diagnostics_bundle())
    ]
    default_metrics = OrderedDict()
    for scalar_metric in ('chaos', 'spatial', 'spectral', 'msm'):
        default_metrics[scalar_metric] = 0
    for list_metric in ('total_iso_ints', 'min_iso_ints', 'max_iso_ints'):
        default_metrics[list_metric] = []
    search_algo_mock.metrics = default_metrics

    # Every formula index maps to the same image id list object.
    image_ids = ['iso_image_1', None, None, None]
    post_images_to_image_store_mock.return_value = {
        formula_i: image_ids for formula_i in range(n_ions)
    }

    db = DB()

    def fail_on_index(*args, **kwargs):
        raise Exception('Test')

    # Replace indexing on this exporter instance so the ES export step raises.
    es = ESExporter(db, local_sm_config)
    es.index_ds = fail_on_index

    ds = create_test_ds(
        name=test_ds_name,
        input_path=input_dir_path,
        config={**ds_config, 'database_ids': [moldb.id]},
        status=DatasetStatus.QUEUED,
        es=es,
    )

    annotate_msg = {
        'ds_id': ds.id,
        'ds_name': test_ds_name,
        'action': DaemonAction.ANNOTATE,
    }
    queue_pub.publish(annotate_msg)

    run_daemons(db, es, local_sm_config)

    # dataset and job tables asserts
    job_row = db.select_one('SELECT status from job')
    assert job_row[0] == 'FINISHED'
    dataset_row = db.select_one('SELECT status from dataset')
    assert dataset_row[0] == 'FAILED'
def test_sm_daemons(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                    post_images_to_annot_service_mock, MolDBServiceWrapperMock,
                    sm_config, test_db, es_dsl_search, clean_isotope_storage):
    """Run the daemons on one queued dataset and check the DB and ES contents.

    Image metrics, metric filtering, image posting and the molecular DB
    service are mocked. The test inserts a QUEUED dataset, publishes an
    'annotate' message, runs the daemons, and then asserts on the dataset,
    acquisition geometry, job and image-metrics rows and on the ES documents.
    """
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)

    get_compute_img_metrics_mock.return_value = lambda *args: (0.9, 0.9, 0.9, [100.], [0], [10.])
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {
        'iso_image_ids': ['iso_image_1', None, None, None]
    }
    post_images_to_annot_service_mock.return_value = {
        35: url_dict,
        44: url_dict
    }

    db = DB(sm_config['db'])
    es = ESExporter(db)

    try:
        # Read the config through a context manager so the handle is closed
        # promptly instead of being left to the garbage collector.
        with open(ds_config_path) as ds_config_file:
            ds_config_str = ds_config_file.read()
        upload_dt = datetime.now()
        ds_id = '2000-01-01_00h00m'
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{"Data_Type": "Imaging MS"}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])

        ds = Dataset.load(db, ds_id)
        # NOTE(review): queue_pub is not a parameter of this test, so it must
        # resolve from module scope - confirm it is defined there.
        queue_pub.publish({'ds_id': ds.id, 'ds_name': ds.name, 'action': 'annotate'})

        run_daemons(db, es)

        # dataset table asserts
        rows = db.select('SELECT id, name, input_path, upload_dt, status from dataset')
        input_path = join(dirname(__file__), 'data', test_ds_name)
        assert len(rows) == 1
        assert rows[0] == (ds_id, test_ds_name, input_path, upload_dt, DatasetStatus.FINISHED)

        # ms acquisition geometry asserts
        rows = db.select('SELECT acq_geometry from dataset')
        assert len(rows) == 1
        assert rows[0][0] == ds.get_acq_geometry(db)
        assert rows[0][0] == {
            ACQ_GEOMETRY_KEYS.LENGTH_UNIT: 'nm',
            ACQ_GEOMETRY_KEYS.AcqGridSection.section_name: {
                ACQ_GEOMETRY_KEYS.AcqGridSection.REGULAR_GRID: True,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_X: 3,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_Y: 3,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_X: 100,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_Y: 100
            },
            ACQ_GEOMETRY_KEYS.PixelSizeSection.section_name: {
                ACQ_GEOMETRY_KEYS.PixelSizeSection.REGULAR_SIZE: True,
                ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_X: 100,
                ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_Y: 100
            }
        }

        # job table asserts
        rows = db.select('SELECT db_id, ds_id, status, start, finish from job')
        assert len(rows) == 1
        db_id, ds_id, status, start, finish = rows[0]
        assert (db_id, ds_id, status) == (0, '2000-01-01_00h00m', JobStatus.FINISHED)
        assert start < finish

        # image metrics asserts
        rows = db.select(('SELECT db_id, sf, adduct, stats, iso_image_ids '
                          'FROM iso_image_metrics '
                          'ORDER BY sf, adduct'))

        # Both checked rows carry the values returned by the mocked metrics function.
        expected_stats = {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9,
                          'total_iso_ints': [100.], 'min_iso_ints': [0], 'max_iso_ints': [10.]}
        expected_image_ids = ['iso_image_1', None, None, None]
        assert rows[0] == (0, 'C12H24O', '+K', expected_stats, expected_image_ids)
        assert rows[1] == (0, 'C12H24O', '+Na', expected_stats, expected_image_ids)

        time.sleep(1)  # Waiting for ES
        # ES asserts
        ds_docs = es_dsl_search.query('term', _type='dataset').execute().to_dict()['hits']['hits']
        assert 1 == len(ds_docs)
        ann_docs = es_dsl_search.query('term', _type='annotation').execute().to_dict()['hits']['hits']
        assert len(ann_docs) == len(rows)
        for doc in ann_docs:
            assert doc['_id'].startswith(ds_id)

    finally:
        db.close()
        # Remove the working data directory; warn_only() wraps the command.
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
import argparse
import json
from os.path import abspath

from sm.engine.es_export import ESExporter

if __name__ == '__main__':
    # Command-line entry point: create the 'sm' ElasticSearch index, deleting
    # it first when --drop is given.
    arg_parser = argparse.ArgumentParser(description='Create ElasticSearch indices')
    arg_parser.add_argument('--conf', default='conf/config.json', help="SM config path")
    arg_parser.add_argument('--drop', action='store_true', help='Delete index if exists')
    args = arg_parser.parse_args()

    index_name = 'sm'

    # Parse the JSON config first, then build the exporter from it.
    with open(abspath(args.conf)) as config_file:
        config = json.load(config_file)

    exporter = ESExporter(config)
    if args.drop:
        exporter.delete_index(index_name)
    exporter.create_index(index_name)
def test_delete_ds__completely(sm_config, test_db, es, sm_index):
    """Check that ``delete_ds`` removes every document of the given dataset.

    Creates three annotation documents (two for 'dataset1', one for
    'dataset2') and one dataset document for 'dataset1', deletes 'dataset1'
    through ``ESExporter``, and asserts that only the 'dataset2' annotation
    is still counted.
    """
    hmdb = MolecularDB(0, 'HMDB', '2016')
    chebi = MolecularDB(1, 'ChEBI', '2016')
    index = sm_config['elasticsearch']['index']

    # (doc_type, document id, dataset id, molecular database)
    seed_docs = [
        ('annotation', 'id1', 'dataset1', hmdb),
        ('annotation', 'id2', 'dataset1', chebi),
        ('annotation', 'id3', 'dataset2', hmdb),
        ('dataset', 'dataset1', 'dataset1', hmdb),
    ]
    for doc_type, doc_id, doc_ds_id, doc_moldb in seed_docs:
        es.create(
            index=index,
            doc_type=doc_type,
            id=doc_id,
            body={
                'ds_id': doc_ds_id,
                'db_id': doc_moldb.id,
                'db_name': doc_moldb.name,
                'db_version': doc_moldb.version,
            },
        )

    wait_for_es(es, index)

    db_mock = MagicMock(spec=DB)
    exporter = ESExporter(db_mock, sm_config)
    exporter.delete_ds(ds_id='dataset1')

    wait_for_es(es, index)

    def count_docs(doc_type, **terms):
        """Count documents of ``doc_type`` matching every given term filter."""
        term_filters = [{'term': {field: value}} for field, value in terms.items()]
        query_body = {'query': {'bool': {'filter': term_filters}}}
        return es.count(index=index, doc_type=doc_type, body=query_body)['count']

    assert count_docs('annotation', ds_id='dataset1', db_id=hmdb.id) == 0
    assert count_docs('annotation', ds_id='dataset1', db_id=chebi.id) == 0
    assert count_docs('annotation', ds_id='dataset2', db_id=hmdb.id) == 1
    assert count_docs('dataset', ds_id='dataset1', _type='dataset') == 0