def test_search_job_imzml_example_es_export_fails(get_compute_img_metrics_mock,
                                                  filter_sf_metrics_mock,
                                                  post_images_to_annot_service_mock,
                                                  MolDBServiceWrapperMock,
                                                  MolDBServiceWrapperMock2,
                                                  sm_config, create_fill_sm_database,
                                                  es_dsl_search, clean_isotope_storage):
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock2)

    get_compute_img_metrics_mock.return_value = lambda *args: (0.9, 0.9, 0.9, [100.], [0], [10.])
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {'iso_image_ids': ['iso_image_1', None, None, None]}
    post_images_to_annot_service_mock.return_value = {35: url_dict, 44: url_dict}

    db = DB(sm_config['db'])

    def throw_exception_function(*args):
        raise Exception('Test')

    try:
        ds_id = '2000-01-01_00h00m'
        upload_dt = datetime.now()
        ds_config_str = open(ds_config_path).read()
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])

        with patch('sm.engine.search_job.ESExporter.index_ds') as index_ds_mock:
            index_ds_mock.side_effect = throw_exception_function

            img_store = ImageStoreServiceWrapper(sm_config['services']['img_service_url'])
            job = SearchJob(img_store=img_store)
            ds = Dataset.load(db, ds_id)
            job.run(ds)
    except ESExportFailedError as e:
        assert e
        # dataset table asserts
        row = db.select_one('SELECT status from dataset')
        assert row[0] == 'FAILED'
    else:
        raise AssertionError('ESExportFailedError should be raised')
    finally:
        db.close()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))

def init_moldb():
    db = DB()
    moldb = create_test_molecular_db()
    db.insert(
        "INSERT INTO molecule (mol_id, mol_name, formula, moldb_id) VALUES (%s, %s, %s, %s)",
        rows=[('HMDB0001', 'molecule name', 'C12H24O', moldb.id)],
    )
    return moldb

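# Usage sketch (not part of the original suite): a hypothetical test that relies on
# init_moldb above. The `test_db` fixture and the DB.select_one call follow the
# conventions of the surrounding snippets; the exact wiring is an assumption.
def test_init_moldb_inserts_molecule(test_db):
    moldb = init_moldb()
    db = DB()
    row = db.select_one('SELECT mol_id, formula FROM molecule WHERE moldb_id = %s', (moldb.id,))
    assert row == ('HMDB0001', 'C12H24O')
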
def store_iso_img_rows(row_it):
    db = DB(db_config)
    try:
        rows = list(row_it)
        if rows:
            db.insert(SF_ISO_IMGS_INS, rows)
    finally:
        db.close()

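# Usage sketch (an assumption, not from the original code): store_iso_img_rows accepts
# any iterable of row tuples, so it can consume a lazily built generator. The row layout
# below is purely illustrative -- real rows must match the SF_ISO_IMGS_INS statement.
def iso_img_rows():
    for sf_id in (1, 2, 3):
        yield (0, sf_id, '+H', [b'\x00'])  # hypothetical (job_id, sf_id, adduct, images) shape

store_iso_img_rows(iso_img_rows())
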
def test_sm_daemons_annot_fails(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                                post_images_to_annot_service_mock,
                                MolDBServiceWrapperMock,
                                sm_config, test_db, es_dsl_search, clean_isotope_storage):
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)

    def throw_exception_function(*args):
        raise Exception('Test')

    get_compute_img_metrics_mock.return_value = throw_exception_function
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {'iso_image_ids': ['iso_image_1', None, None, None]}
    post_images_to_annot_service_mock.return_value = {35: url_dict, 44: url_dict}

    db = DB(sm_config['db'])
    es = ESExporter(db)
    annotate_daemon = None
    try:
        ds_id = '2000-01-01_00h00m'
        upload_dt = datetime.now()
        ds_config_str = open(ds_config_path).read()
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])
        ds = Dataset.load(db, ds_id)
        queue_pub.publish({'ds_id': ds.id, 'ds_name': ds.name, 'action': 'annotate'})

        run_daemons(db, es)

        # dataset and job tables asserts
        row = db.select_one('SELECT status from dataset')
        assert row[0] == 'FAILED'
        row = db.select_one('SELECT status from job')
        assert row[0] == 'FAILED'
    finally:
        db.close()
        if annotate_daemon:
            annotate_daemon.stop()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))

def fill_test_db(create_test_db, drop_test_db):
    db_config = dict(database='sm_test', user='******', host='localhost', password='******')
    db = DB(db_config)
    try:
        db.alter('TRUNCATE dataset CASCADE')
        db.insert("INSERT INTO dataset VALUES (%s, %s, %s, %s, %s)",
                  [(1, 'ds_name', '/ds_path', json.dumps({}), json.dumps({}))])
        db.alter('TRUNCATE coordinates CASCADE')
    finally:
        db.close()

def fill_db(test_db, sm_config, ds_config):
    upload_dt = '2000-01-01 00:00:00'
    ds_id = '2000-01-01'
    metadata = {"meta": "data"}
    db = DB(sm_config['db'])
    db.insert(
        ('INSERT INTO dataset (id, name, input_path, upload_dt, metadata, config, status, '
         'is_public, mol_dbs, adducts) '
         'VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'),
        rows=[(ds_id, 'ds_name', 'input_path', upload_dt, json.dumps(metadata),
               json.dumps(ds_config), DatasetStatus.FINISHED, True,
               ['HMDB-v4'], ['+H', '+Na', '+K'])])

def create_fill_sm_database(create_test_db, drop_test_db, sm_config):
    proj_dir_path = dirname(dirname(__file__))
    local('psql -h localhost -U sm sm_test < {}'.format(
        join(proj_dir_path, 'scripts/create_schema.sql')))

    db = DB(sm_config['db'])
    try:
        db.insert('INSERT INTO dataset VALUES (%s, %s, %s, %s, %s, %s)',
                  [(0, 'name', 0, 'fpath', json.dumps({}), json.dumps({}))])
        db.insert('INSERT INTO job VALUES (%s, %s, %s, %s, %s, %s, %s, %s)',
                  [(0, 0, 0, '', 0, 0, None, None)])
    finally:
        db.close()

def test_new_ds_saves_to_db(test_db, metadata, ds_config):
    db = DB()
    moldb = create_test_molecular_db()
    ds = create_test_ds(config={**ds_config, 'database_ids': [moldb.id]})

    ion_metrics_df = pd.DataFrame({
        'formula': ['H2O', 'H2O', 'CO2', 'CO2', 'H2SO4', 'H2SO4'],
        'adduct': ['+H', '[M]+', '+H', '[M]+', '+H', '[M]+'],
        'fdr': [0.05, 0.1, 0.05, 0.1, 0.05, 0.1],
        'image_id': list(map(str, range(6))),
    })
    (job_id,) = db.insert_return(
        "INSERT INTO job (moldb_id, ds_id, status) VALUES (%s, %s, 'FINISHED') RETURNING id",
        rows=[(moldb.id, ds.id)],
    )
    db.insert(
        'INSERT INTO annotation('
        '   job_id, formula, chem_mod, neutral_loss, adduct, msm, fdr, stats, iso_image_ids'
        ') '
        "VALUES (%s, %s, '', '', %s, 1, %s, '{}', %s)",
        [(job_id, r.formula, r.adduct, r.fdr, [r.image_id])
         for _, r in ion_metrics_df.iterrows()],
    )

    with patch(
        'sm.engine.postprocessing.colocalization.ImageStorage.get_ion_images_for_analysis'
    ) as get_ion_images_for_analysis_mock:
        get_ion_images_for_analysis_mock.side_effect = mock_get_ion_images_for_analysis

        Colocalization(db).run_coloc_job(ds)

    jobs = db.select('SELECT id, error, sample_ion_ids FROM graphql.coloc_job')
    annotations = db.select('SELECT coloc_ion_ids, coloc_coeffs FROM graphql.coloc_annotation')
    ions = db.select('SELECT id FROM graphql.ion')

    assert len(jobs) > 0
    assert not any(job[1] for job in jobs)
    assert jobs[0][2]
    assert len(annotations) > 10
    assert all(len(ann[0]) == len(ann[1]) for ann in annotations)
    assert len(ions) == len(ion_metrics_df)

def fill_db(test_db, metadata, ds_config):
    upload_dt = '2000-01-01 00:00:00'
    ds_id = '2000-01-01'
    db = DB()
    db.insert(
        ('INSERT INTO dataset (id, name, input_path, upload_dt, metadata, config, status, '
         'status_update_dt, is_public) '
         'VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)'),
        rows=[(
            ds_id,
            'ds_name',
            'input_path',
            upload_dt,
            json.dumps(metadata),
            json.dumps(ds_config),
            DatasetStatus.FINISHED,
            upload_dt,
            True,
        )],
    )
    create_test_molecular_db()

def create_fill_test_db(create_test_db, drop_test_db):
    db_config = dict(database='sm_test', user='******', host='localhost', password='******')
    db = DB(db_config)
    try:
        db.alter('TRUNCATE formula_db CASCADE')
        db.insert('INSERT INTO formula_db VALUES (%s, %s, %s)',
                  [(0, '2016-01-01', 'HMDB')])
        db.insert('INSERT INTO formula VALUES (%s, %s, %s, %s, %s)',
                  [(9, 0, '04138', 'Au', 'Gold')])
        db.insert('INSERT INTO agg_formula VALUES (%s, %s, %s, %s, %s)',
                  [(9, 0, 'Au', ['04138'], ['Gold'])])
        db.alter('TRUNCATE theor_peaks CASCADE')
        db.insert('INSERT INTO theor_peaks VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)',
                  [(0, 9, '+H', 0.01, 1, 10000, [100, 200], [10, 1], [], [])])
    finally:
        db.close()

def fill_db(test_db, sm_config, ds_config):
    upload_dt = '2000-01-01 00:00:00'
    ds_id = '2000-01-01'
    meta = {"meta": "data"}
    db = DB(sm_config['db'])
    db.insert(
        'INSERT INTO dataset (id, name, input_path, upload_dt, metadata, config, '
        'status, is_public, mol_dbs, adducts) '
        'VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)',
        rows=[(ds_id, 'ds_name', 'input_path', upload_dt, json.dumps(meta),
               json.dumps(ds_config), 'FINISHED', True, ['HMDB-v4'], ['+H'])])
    db.insert("INSERT INTO job (id, db_id, ds_id) VALUES (%s, %s, %s)",
              rows=[(0, 0, ds_id)])
    db.insert("INSERT INTO sum_formula (id, db_id, sf) VALUES (%s, %s, %s)",
              rows=[(1, 0, 'H2O')])
    db.insert(
        "INSERT INTO iso_image_metrics (job_id, db_id, sf, adduct, iso_image_ids) "
        "VALUES (%s, %s, %s, %s, %s)",
        rows=[(0, 0, 'H2O', '+H', ['iso_image_1_id', 'iso_image_2_id'])])
    db.close()

def create_fill_sm_database(create_test_db, drop_test_db, sm_config):
    local('psql -h localhost -U sm sm_test < {}'.format(
        join(proj_dir_path, 'scripts/create_schema.sql')))

    db = DB(sm_config['db'])
    try:
        db.insert('INSERT INTO formula_db VALUES (%s, %s, %s)',
                  [(0, '2016-01-01', 'HMDB')])
        db.insert('INSERT INTO formula VALUES (%s, %s, %s, %s, %s)',
                  [(100, 0, '00001', 'compound_name', 'C12H24O')])
        db.insert('INSERT INTO agg_formula VALUES (%s, %s, %s, %s, %s)',
                  [(10007, 0, 'C12H24O', ['00001'], ['compound_name'])])
    finally:
        db.close()

def fill_db(test_db):
    db = DB()
    db.insert(
        'INSERT INTO graphql.user (id, name, email) VALUES (%s, %s, %s)',
        [(USER_ID, 'name', '*****@*****.**')],
    )
    db.insert(
        'INSERT INTO graphql.group (id, name, short_name) VALUES (%s, %s, %s)',
        [(GROUP_ID, 'test-group', 'test-group')],
    )
    db.insert(
        'INSERT INTO graphql.dataset (id, user_id, group_id) VALUES (%s, %s, %s)',
        [('dataset id', USER_ID, GROUP_ID)],
    )
    yield

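# Consumption sketch: fill_db yields, so it is presumably registered as a pytest fixture
# (an @pytest.fixture decorator is assumed; it is not shown in the snippet above).
def test_dataset_belongs_to_user(fill_db):
    db = DB()
    row = db.select_one("SELECT user_id FROM graphql.dataset WHERE id = %s", ('dataset id',))
    assert row[0] == USER_ID
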
class SearchJob(object):
    """ Main class responsible for molecule search. Uses other modules of the engine.

    Args
    ----------
    ds_name : string
        A dataset short name
    """
    def __init__(self, client_email, ds_name):
        self.sm_config = SMConfig.get_conf()
        self.client_email = client_email
        self.ds_name = ds_name
        self.ds_id = None
        self.job_id = None
        self.sc = None
        self.db = None
        self.ds = None
        self.fdr = None
        self.formulas = None
        self.ds_config = None
        self.wd_manager = None

    def _read_ds_config(self):
        with open(self.wd_manager.ds_config_path) as f:
            self.ds_config = json.load(f)

    def _configure_spark(self):
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self.sm_config['spark'].iteritems():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        if 'aws' in self.sm_config:
            sconf.set("spark.hadoop.fs.s3a.access.key",
                      self.sm_config['aws']['aws_access_key_id'])
            sconf.set("spark.hadoop.fs.s3a.secret.key",
                      self.sm_config['aws']['aws_secret_access_key'])
            sconf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

        # sconf.set("spark.python.profile", "true")
        self.sc = SparkContext(master=self.sm_config['spark']['master'],
                               conf=sconf, appName='SM engine')
        if not self.sm_config['spark']['master'].startswith('local'):
            self.sc.addPyFile(join(local_path(proj_root()), 'sm.zip'))

    def _init_db(self):
        logger.info('Connecting to the DB')
        self.db = DB(self.sm_config['db'])
        self.sf_db_id = self.db.select_one(DB_ID_SEL, self.ds_config['database']['name'])[0]

    def store_job_meta(self):
        """ Store search job metadata in the database """
        logger.info('Storing job metadata')
        self.ds_id = int(self.db.select_one(DS_ID_SEL, self.ds_name)[0])
        self.job_id = self.ds_id
        self.db.alter(DEL_JOB_SQL, self.job_id)
        rows = [(self.job_id, self.sf_db_id, self.ds_id,
                 datetime.now().strftime('%Y-%m-%d %H:%M:%S'))]
        self.db.insert(JOB_INS, rows)

        rows = [(self.job_id, adduct)
                for adduct in self.ds_config['isotope_generation']['adducts']]
        self.db.insert(ADDUCT_INS, rows)

    def run(self, input_path, ds_config_path, clean=False):
        """ Entry point of the engine. Molecule search is completed in several steps:
         * Copying input data to the engine work dir
         * Conversion of the input data (imzML+ibd) to plain text format: one line - one spectrum
         * Generation of theoretical peaks for all formulas from the molecule database
           and saving them to the database
         * Molecule search. The most compute-intensive part; Spark is used to run it
           in a distributed manner.
         * Saving results (isotope images and their quality metrics for each putative
           molecule) to the database

        Args
        -------
        input_path : string
            Path to the dataset folder with .imzML and .ibd files
        ds_config_path: string
            Path to the dataset config file
        clean : bool
            Clean all interim data files before starting molecule search
        """
        try:
            self.wd_manager = WorkDirManager(self.ds_name)
            if clean:
                self.wd_manager.clean()

            self.wd_manager.copy_input_data(input_path, ds_config_path)

            self._read_ds_config()
            logger.info('Dataset config:\n%s', pformat(self.ds_config))

            self._configure_spark()
            self._init_db()

            if not self.wd_manager.exists(self.wd_manager.txt_path):
                imzml_converter = ImzmlTxtConverter(self.ds_name,
                                                    self.wd_manager.local_dir.imzml_path,
                                                    self.wd_manager.local_dir.txt_path,
                                                    self.wd_manager.local_dir.coord_path)
                imzml_converter.convert()

                if not self.wd_manager.local_fs_only:
                    self.wd_manager.upload_to_remote()

            self.ds = Dataset(self.sc, self.ds_name, self.client_email, input_path,
                              self.ds_config, self.wd_manager, self.db)
            self.ds.save_ds_meta()

            self.store_job_meta()

            theor_peaks_gen = TheorPeaksGenerator(self.sc, self.sm_config, self.ds_config)
            theor_peaks_gen.run()

            target_adducts = self.ds_config['isotope_generation']['adducts']
            self.fdr = FDR(self.job_id, self.sf_db_id,
                           decoy_sample_size=20, target_adducts=target_adducts, db=self.db)
            self.fdr.decoy_adduct_selection()
            self.formulas = FormulasSegm(self.job_id, self.sf_db_id, self.ds_config, self.db)

            # search_alg = MSMBasicSearch(self.sc, self.ds, self.formulas, self.fdr, self.ds_config)
            search_alg = MSMExtraFeats(self.sc, self.ds, self.formulas, self.fdr, self.ds_config)
            sf_metrics_df, sf_iso_images = search_alg.search()

            search_results = SearchResults(self.sf_db_id, self.ds_id, self.job_id,
                                           self.ds_name, self.formulas.get_sf_adduct_peaksn(),
                                           self.db, self.sm_config, self.ds_config)
            search_results.sf_metrics_df = sf_metrics_df
            search_results.sf_iso_images = sf_iso_images
            search_results.metrics = search_alg.metrics
            search_results.nrows, search_results.ncols = self.ds.get_dims()
            search_results.store()

            es = ESExporter(self.sm_config)
            es.index_ds(self.db, self.ds_name, self.ds_config['database']['name'])
        except Exception:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            logger.error('\n'.join(
                traceback.format_exception(exc_type, exc_value, exc_traceback)))
        finally:
            if self.sc:
                # self.sc.show_profiles()
                self.sc.stop()
            if self.db:
                self.db.close()

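# Invocation sketch for SearchJob, following its own docstring above; the paths and
# email address are placeholders, not values from the original code.
job = SearchJob(client_email='submitter@example.com', ds_name='example_ds')
job.run(
    input_path='/data/example_ds',                  # folder containing .imzML and .ibd files
    ds_config_path='/data/example_ds/config.json',  # dataset config file
    clean=True,                                     # wipe interim files before the search
)
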
def add_diagnostics(diagnostics: List[DatasetDiagnostic]):
    """Upserts dataset diagnostics, overwriting existing values with the same ds_id, job_id, type"""
    # Validate input, as postgres can't enforce that the JSON columns have the correct schema,
    # and many places (graphql, python client, etc.) rely on these structures.
    if not diagnostics:
        return
    for diagnostic in diagnostics:
        assert 'ds_id' in diagnostic
        assert 'type' in diagnostic
        images = diagnostic.get('images', [])
        assert all(image['key'] in DiagnosticImageKey for image in images)
        assert all(image['format'] in DiagnosticImageFormat for image in images)
        assert all(image['image_id'] in image['url'] for image in images)
        image_keys = set((image.get('key'), image.get('index')) for image in images)
        assert len(image_keys) == len(images), 'diagnostic image keys should be unique'

    db = DB()
    # Find all diagnostics that should be replaced by the new diagnostics
    existing = db.select_with_fields(
        """
        WITH new_diagnostic AS (
            SELECT UNNEST(%s::text[]) as ds_id,
                   UNNEST(%s::int[]) as job_id,
                   UNNEST(%s::text[]) as type
        )
        SELECT dd.ds_id, dd.id, dd.images
        FROM new_diagnostic nd
        JOIN dataset_diagnostic dd ON nd.ds_id = dd.ds_id
            AND (nd.job_id = dd.job_id OR (nd.job_id IS NULL AND dd.job_id IS NULL))
            AND nd.type = dd.type
        """,
        list(map(list, zip(*((d['ds_id'], d.get('job_id'), d['type']) for d in diagnostics)))),
    )
    if existing:
        logger.debug(
            f'Deleting {len(existing)} existing diagnostics for dataset {existing[0]["ds_id"]}'
        )
        # Delete existing images
        image_ids_by_ds = defaultdict(list)
        for row in existing:
            for img in row['images'] or []:
                image_ids_by_ds[row['ds_id']].append(img['image_id'])
        for ds_id, image_ids in image_ids_by_ds.items():
            image_storage.delete_images(image_storage.DIAG, ds_id, image_ids)

        # Delete existing DB rows
        db.alter(
            'DELETE FROM dataset_diagnostic WHERE id = ANY(%s::uuid[])',
            ([row['id'] for row in existing],),
        )

    logger.debug(f'Inserting {len(diagnostics)} diagnostics for dataset {diagnostics[0]["ds_id"]}')
    db.insert(
        'INSERT INTO dataset_diagnostic (ds_id, job_id, type, updated_dt, data, error, images) '
        'VALUES (%s, %s, %s, %s, %s, %s, %s)',
        [(
            d['ds_id'],
            d.get('job_id'),
            d['type'],
            datetime.now(),
            numpy_json_dumps(d['data']) if d.get('data') is not None else None,
            d.get('error'),
            numpy_json_dumps(d.get('images', [])),
        ) for d in diagnostics],
    )

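# Minimal call sketch for add_diagnostics. 'TIC' is a placeholder type value, not
# necessarily a member of the real diagnostic-type enum; an empty `images` list keeps
# the image-schema assertions trivially satisfied.
add_diagnostics([
    {
        'ds_id': '2000-01-01_00h00m',
        'job_id': None,                            # dataset-level diagnostic (no specific job)
        'type': 'TIC',                             # placeholder
        'data': {'min_tic': 0.0, 'max_tic': 1.0},  # hypothetical payload
        'error': None,
        'images': [],
    }
])
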
def test_index_ds_works(sm_config, test_db, es, es_dsl_search, sm_index, ds_config,
                        metadata, annotation_stats):
    ds_id = '2000-01-01_00h00m'
    upload_dt = datetime.now().isoformat()
    last_finished = '2017-01-01 00:00:00'
    iso_image_ids = ['iso_img_id_1', 'iso_img_id_2']
    stats = json.dumps(annotation_stats)

    db = DB()
    db.insert(
        "INSERT INTO dataset(id, name, input_path, config, metadata, upload_dt, status, "
        "status_update_dt, is_public, acq_geometry, ion_thumbnail) "
        "VALUES (%s, 'ds_name', 'ds_input_path', %s, %s, %s, 'ds_status', %s, true, '{}', %s)",
        [[ds_id, json.dumps(ds_config), json.dumps(metadata), upload_dt, upload_dt, 'thumb-id']],
    )
    moldb = create_test_molecular_db()
    (job_id,) = db.insert_return(
        "INSERT INTO job(ds_id, moldb_id, status, start, finish) "
        "VALUES (%s, %s, 'job_status', %s, %s) RETURNING id",
        rows=[(ds_id, moldb.id, last_finished, last_finished)],
    )
    (user_id,) = db.insert_return(
        "INSERT INTO graphql.user (email, name, role) "
        "VALUES ('email', 'user_name', 'user') RETURNING id",
        [[]],
    )
    (group_id,) = db.insert_return(
        "INSERT INTO graphql.group (name, short_name) VALUES ('group name', 'grp') RETURNING id",
        [[]],
    )
    db.insert(
        "INSERT INTO graphql.dataset(id, user_id, group_id) VALUES (%s, %s, %s)",
        [[ds_id, user_id, group_id]],
    )
    ion_id1, ion_id2 = db.insert_return(
        "INSERT INTO graphql.ion(ion, formula, chem_mod, neutral_loss, adduct, charge, ion_formula) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s) RETURNING id",
        [
            ['H2O-H+O-H+H', 'H2O', '-H+O', '-H', '+H', 1, 'HO2'],
            ['Au+H', 'Au', '', '', '+H', 1, 'HAu'],
        ],
    )
    db.insert(
        "INSERT INTO annotation(job_id, formula, chem_mod, neutral_loss, adduct, "
        "msm, fdr, stats, iso_image_ids, ion_id) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
        [
            [job_id, 'H2O', '-H+O', '-H', '+H', 1, 0.1, stats, iso_image_ids, ion_id1],
            [job_id, 'Au', '', '', '+H', 1, 0.05, stats, iso_image_ids, ion_id2],
        ],
    )
    isocalc_mock = MagicMock(IsocalcWrapper)
    isocalc_mock.centroids = lambda formula: {
        'H2O+H': ([100.0, 200.0], None),
        'H2O-H+O-H+H': ([100.0, 200.0, 300.0], None),
        'Au+H': ([10.0, 20.0], None),
    }[formula]
    isocalc_mock.mass_accuracy_bounds = lambda mzs: (mzs, mzs)

    with patch(
        'sm.engine.es_export.molecular_db.fetch_molecules',
        return_value=pd.DataFrame(
            [('H2O', 'mol_id', 'mol_name'), ('Au', 'mol_id', 'mol_name')],
            columns=['formula', 'mol_id', 'mol_name'],
        ),
    ):
        es_exp = ESExporter(db, sm_config)
        es_exp.delete_ds(ds_id)
        es_exp.index_ds(
            ds_id=ds_id,
            moldb=moldb,
            isocalc=isocalc_mock,
        )

    wait_for_es(es, sm_config['elasticsearch']['index'])

    ds_d = (
        es_dsl_search.filter('term', _type='dataset')
        .execute()
        .to_dict()['hits']['hits'][0]['_source']
    )
    expected_ds_fields = {
        'ds_last_finished': last_finished,
        'ds_config': ds_config,
        'ds_adducts': ds_config['isotope_generation']['adducts'],
        'ds_moldb_ids': ds_config['database_ids'],
        'ds_chem_mods': [],
        'ds_neutral_losses': [],
        'ds_project_ids': [],
        'ds_project_names': [],
        'ds_meta': metadata,
        'ds_status': 'ds_status',
        'ds_status_update_dt': upload_dt,
        'ds_name': 'ds_name',
        'ds_input_path': 'ds_input_path',
        'ds_id': ds_id,
        'ds_upload_dt': upload_dt,
        'ds_is_public': True,
        'ds_submitter_email': 'email',
        'ds_submitter_id': user_id,
        'ds_submitter_name': 'user_name',
        'ds_group_approved': False,
        'ds_group_id': group_id,
        'ds_group_name': 'group name',
        'ds_group_short_name': 'grp',
    }
    assert ds_d == {
        **expected_ds_fields,
        'ds_acq_geometry': {},
        'annotation_counts': [{
            'db': {'id': moldb.id, 'name': moldb.name},
            'counts': [
                {'level': 5, 'n': 1},
                {'level': 10, 'n': 2},
                {'level': 20, 'n': 2},
                {'level': 50, 'n': 2},
            ],
        }],
    }

    ann_1_d = (
        es_dsl_search.filter('term', formula='H2O')
        .execute()
        .to_dict()['hits']['hits'][0]['_source']
    )
    top_level_stats = {
        'pattern_match': annotation_stats['spectral'],
        'image_corr': annotation_stats['spatial'],
        'chaos': annotation_stats['chaos'],
        **{key: value for key, value in annotation_stats.items() if key in NON_METRIC_STATS},
    }
    metrics = {
        key: value for key, value in annotation_stats.items() if key not in NON_METRIC_STATS
    }
    assert ann_1_d == {
        **expected_ds_fields,
        **top_level_stats,
        'metrics': metrics,
        'fdr': 0.1,
        'formula': 'H2O',
        'msm': 1.0,
        'ion': 'H2O-H+O-H+H+',
        'ion_formula': 'HO2',
        'centroid_mzs': [100.0, 200.0, 300.0],
        'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'],
        'iso_image_urls': [
            f'http://localhost:9000/{sm_config["image_storage"]["bucket"]}/iso/{ds_id}/iso_img_id_1',
            f'http://localhost:9000/{sm_config["image_storage"]["bucket"]}/iso/{ds_id}/iso_img_id_2',
        ],
        'isobars': [],
        'isomer_ions': [],
        'polarity': '+',
        'job_id': 1,
        'adduct': '+H',
        'neutral_loss': '-H',
        'chem_mod': '-H+O',
        'annotation_counts': [],
        'comp_names': ['mol_name'],
        'comps_count_with_isomers': 1,
        'db_id': moldb.id,
        'db_name': moldb.name,
        'db_version': moldb.version,
        'mz': 100.0,
        'comp_ids': ['mol_id'],
        'annotation_id': 1,
        'off_sample_label': None,
        'off_sample_prob': None,
    }

    ann_2_d = (
        es_dsl_search.filter('term', formula='Au')
        .execute()
        .to_dict()['hits']['hits'][0]['_source']
    )
    assert ann_2_d == {
        **expected_ds_fields,
        **top_level_stats,
        'metrics': metrics,
        'fdr': 0.05,
        'formula': 'Au',
        'msm': 1.0,
        'ion': 'Au+H+',
        'ion_formula': 'HAu',
        'centroid_mzs': [10.0, 20.0],
        'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'],
        'iso_image_urls': [
            f'http://localhost:9000/{sm_config["image_storage"]["bucket"]}/iso/{ds_id}/iso_img_id_1',
            f'http://localhost:9000/{sm_config["image_storage"]["bucket"]}/iso/{ds_id}/iso_img_id_2',
        ],
        'isobars': [],
        'isomer_ions': [],
        'polarity': '+',
        'job_id': 1,
        'adduct': '+H',
        'neutral_loss': '',
        'chem_mod': '',
        'annotation_counts': [],
        'comp_names': ['mol_name'],
        'comps_count_with_isomers': 1,
        'db_id': moldb.id,
        'db_name': moldb.name,
        'db_version': moldb.version,
        'mz': 10.0,
        'comp_ids': ['mol_id'],
        'annotation_id': 2,
        'off_sample_label': None,
        'off_sample_prob': None,
    }

def test_sm_daemons(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                    post_images_to_annot_service_mock,
                    MolDBServiceWrapperMock,
                    sm_config, test_db, es_dsl_search, clean_isotope_storage):
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)

    get_compute_img_metrics_mock.return_value = lambda *args: (0.9, 0.9, 0.9, [100.], [0], [10.])
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {'iso_image_ids': ['iso_image_1', None, None, None]}
    post_images_to_annot_service_mock.return_value = {35: url_dict, 44: url_dict}

    db = DB(sm_config['db'])
    es = ESExporter(db)
    annotate_daemon = None
    update_daemon = None
    try:
        ds_config_str = open(ds_config_path).read()
        upload_dt = datetime.now()
        ds_id = '2000-01-01_00h00m'
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{"Data_Type": "Imaging MS"}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])

        ds = Dataset.load(db, ds_id)
        queue_pub.publish({'ds_id': ds.id, 'ds_name': ds.name, 'action': 'annotate'})

        run_daemons(db, es)

        # dataset table asserts
        rows = db.select('SELECT id, name, input_path, upload_dt, status from dataset')
        input_path = join(dirname(__file__), 'data', test_ds_name)
        assert len(rows) == 1
        assert rows[0] == (ds_id, test_ds_name, input_path, upload_dt, DatasetStatus.FINISHED)

        # ms acquisition geometry asserts
        rows = db.select('SELECT acq_geometry from dataset')
        assert len(rows) == 1
        assert rows[0][0] == ds.get_acq_geometry(db)
        assert rows[0][0] == {
            ACQ_GEOMETRY_KEYS.LENGTH_UNIT: 'nm',
            ACQ_GEOMETRY_KEYS.AcqGridSection.section_name: {
                ACQ_GEOMETRY_KEYS.AcqGridSection.REGULAR_GRID: True,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_X: 3,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_Y: 3,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_X: 100,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_Y: 100
            },
            ACQ_GEOMETRY_KEYS.PixelSizeSection.section_name: {
                ACQ_GEOMETRY_KEYS.PixelSizeSection.REGULAR_SIZE: True,
                ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_X: 100,
                ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_Y: 100
            }
        }

        # job table asserts
        rows = db.select('SELECT db_id, ds_id, status, start, finish from job')
        assert len(rows) == 1
        db_id, ds_id, status, start, finish = rows[0]
        assert (db_id, ds_id, status) == (0, '2000-01-01_00h00m', JobStatus.FINISHED)
        assert start < finish

        # image metrics asserts
        rows = db.select('SELECT db_id, sf, adduct, stats, iso_image_ids '
                         'FROM iso_image_metrics '
                         'ORDER BY sf, adduct')
        assert rows[0] == (0, 'C12H24O', '+K',
                           {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9,
                            'total_iso_ints': [100.], 'min_iso_ints': [0], 'max_iso_ints': [10.]},
                           ['iso_image_1', None, None, None])
        assert rows[1] == (0, 'C12H24O', '+Na',
                           {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9,
                            'total_iso_ints': [100.], 'min_iso_ints': [0], 'max_iso_ints': [10.]},
                           ['iso_image_1', None, None, None])

        time.sleep(1)  # Waiting for ES

        # ES asserts
        ds_docs = es_dsl_search.query('term', _type='dataset').execute().to_dict()['hits']['hits']
        assert 1 == len(ds_docs)
        ann_docs = es_dsl_search.query('term', _type='annotation').execute().to_dict()['hits']['hits']
        assert len(ann_docs) == len(rows)
        for doc in ann_docs:
            assert doc['_id'].startswith(ds_id)
    finally:
        db.close()
        if annotate_daemon:
            annotate_daemon.stop()
        if update_daemon:
            update_daemon.stop()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))

def test_search_job_imzml_example(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                                  post_images_to_annot_service_mock,
                                  MolDBServiceWrapperMock, MolDBServiceWrapperMock2,
                                  sm_config, create_fill_sm_database, es_dsl_search,
                                  clean_isotope_storage):
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock2)

    get_compute_img_metrics_mock.return_value = lambda *args: (0.9, 0.9, 0.9, [100.], [0], [10.])
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {'iso_image_ids': ['iso_image_1', None, None, None]}
    post_images_to_annot_service_mock.return_value = {35: url_dict, 44: url_dict}

    db = DB(sm_config['db'])
    try:
        ds_config_str = open(ds_config_path).read()
        upload_dt = datetime.now()
        ds_id = '2000-01-01_00h00m'
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])

        img_store = ImageStoreServiceWrapper(sm_config['services']['img_service_url'])
        job = SearchJob(img_store=img_store)
        job._sm_config['rabbitmq'] = {}  # avoid talking to RabbitMQ during the test
        ds = Dataset.load(db, ds_id)
        job.run(ds)

        # dataset table asserts
        rows = db.select('SELECT id, name, input_path, upload_dt, status from dataset')
        input_path = join(dirname(__file__), 'data', test_ds_name)
        assert len(rows) == 1
        assert rows[0] == (ds_id, test_ds_name, input_path, upload_dt, DatasetStatus.FINISHED)

        # ms acquisition geometry asserts
        rows = db.select('SELECT acq_geometry from dataset')
        assert len(rows) == 1
        assert rows[0][0] == ds.get_acq_geometry(db)
        assert rows[0][0] == {
            ACQ_GEOMETRY_KEYS.LENGTH_UNIT: 'nm',
            ACQ_GEOMETRY_KEYS.AcqGridSection.section_name: {
                ACQ_GEOMETRY_KEYS.AcqGridSection.REGULAR_GRID: True,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_X: 3,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_Y: 3,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_X: 100,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_Y: 100
            },
            ACQ_GEOMETRY_KEYS.PixelSizeSection.section_name: {
                ACQ_GEOMETRY_KEYS.PixelSizeSection.REGULAR_SIZE: True,
                ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_X: 100,
                ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_Y: 100
            }
        }

        # job table asserts
        rows = db.select('SELECT db_id, ds_id, status, start, finish from job')
        assert len(rows) == 1
        db_id, ds_id, status, start, finish = rows[0]
        assert (db_id, ds_id, status) == (0, '2000-01-01_00h00m', 'FINISHED')
        assert start < finish

        # image metrics asserts
        rows = db.select('SELECT db_id, sf, adduct, stats, iso_image_ids '
                         'FROM iso_image_metrics '
                         'ORDER BY sf, adduct')
        assert rows[0] == (0, 'C12H24O', '+K',
                           {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9,
                            'total_iso_ints': [100.], 'min_iso_ints': [0], 'max_iso_ints': [10.]},
                           ['iso_image_1', None, None, None])
        assert rows[1] == (0, 'C12H24O', '+Na',
                           {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9,
                            'total_iso_ints': [100.], 'min_iso_ints': [0], 'max_iso_ints': [10.]},
                           ['iso_image_1', None, None, None])

        time.sleep(1)  # Waiting for ES

        # ES asserts
        ds_docs = es_dsl_search.query('term', _type='dataset').execute().to_dict()['hits']['hits']
        assert 1 == len(ds_docs)
        ann_docs = es_dsl_search.query('term', _type='annotation').execute().to_dict()['hits']['hits']
        assert len(ann_docs) == len(rows)
        for doc in ann_docs:
            assert doc['_id'].startswith(ds_id)
    finally:
        db.close()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))

def fill_db(test_db, metadata, ds_config):
    upload_dt = '2000-01-01 00:00:00'
    ds_id = '2000-01-01'
    db = DB()
    db.insert(
        'INSERT INTO dataset ('
        '   id, name, input_path, upload_dt, metadata, config, status, status_update_dt, is_public'
        ') VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)',
        rows=[(
            ds_id,
            'ds_name',
            'input_path',
            upload_dt,
            json.dumps(metadata),
            json.dumps(ds_config),
            'FINISHED',
            upload_dt,
            True,
        )],
    )
    moldb = create_test_molecular_db()
    db.insert("INSERT INTO job (id, moldb_id, ds_id) VALUES (%s, %s, %s)",
              rows=[(0, moldb.id, ds_id)])
    db.insert(
        "INSERT INTO annotation (job_id, formula, chem_mod, neutral_loss, adduct, "
        "msm, fdr, stats, iso_image_ids) VALUES (%s, %s, '', '', %s, 0.5, 0.2, '{}', %s)",
        rows=[
            (0, 'H2O', '+H', ['iso_image_11', 'iso_image_12']),
            (0, 'CH4', '+H', ['iso_image_21', 'iso_image_22']),
        ],
    )
    user_id = str(uuid.uuid4())
    db.insert(
        "INSERT INTO graphql.user (id, name, email) VALUES (%s, %s, %s)",
        rows=[(user_id, 'name', '*****@*****.**')],
    )
    group_id = str(uuid.uuid4())
    db.insert(
        "INSERT INTO graphql.group (id, name, short_name) VALUES (%s, %s, %s)",
        rows=[(group_id, 'group name', 'short name')],
    )
    db.insert(
        "INSERT INTO graphql.dataset (id, user_id, group_id) VALUES (%s, %s, %s)",
        rows=[('dataset id', user_id, group_id)],
    )
    return {"moldb": moldb}