def migrate_optical_images(ds_id):
    output.print('Migrating optical images')
    with timeit():
        output.print('Transferring images and updating database...')

        db = DB()
        rows = db.select(SEL_OPTICAL_IMGS, params=(ds_id,))
        for opt_image_id, opt_image_url in rows:
            if not opt_image_url and opt_image_id:
                transfer_images(
                    ds_id,
                    'optical_images',
                    image_storage.OPTICAL,
                    [opt_image_id],
                )
                opt_image_url = image_storage.get_image_url(
                    image_storage.OPTICAL, ds_id, opt_image_id
                )
                db.alter(UPD_OPTICAL_IMGS, params=(opt_image_url, opt_image_id))

        opt_thumb_id, opt_thumb_url = db.select_one(SEL_OPT_THUMB, params=(ds_id,))
        if not opt_thumb_url and opt_thumb_id:
            transfer_images(
                ds_id,
                'optical_images',
                image_storage.OPTICAL,
                [opt_thumb_id],
            )
            opt_thumb_url = image_storage.get_image_url(image_storage.OPTICAL, ds_id, opt_thumb_id)
            db.alter(UPD_OPT_THUMB, params=(opt_thumb_url, ds_id))
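# Hedged usage sketch: DB() is called without arguments above, which elsewhere in these
# snippets relies on an open ConnectionPool (see the molecule-name update script further
# down). The config values and dataset id here are placeholders, not taken from the
# original migration script.
db_config = {'host': 'localhost', 'database': 'sm', 'user': 'sm'}  # assumed config values
with ConnectionPool(db_config):
    migrate_optical_images('2000-01-01_00h00m01s')  # placeholder ds_id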
def del_diagnostics(ds_id: str, job_ids: Optional[List[int]] = None):
    db = DB()
    if job_ids is None:
        existing = db.select_with_fields(
            'SELECT id, images FROM dataset_diagnostic dd WHERE dd.ds_id = %s',
            [ds_id],
        )
    else:
        existing = db.select_with_fields(
            'SELECT id, images FROM dataset_diagnostic dd '
            'WHERE dd.ds_id = %s AND dd.job_id = ANY(%s)',
            [ds_id, job_ids],
        )

    if existing:
        # Delete existing images
        image_ids = [img['image_id'] for row in existing for img in row['images'] or []]
        image_storage.delete_images(image_storage.DIAG, ds_id, image_ids)

        # Delete existing DB rows
        db.alter(
            'DELETE FROM dataset_diagnostic WHERE id = ANY(%s::uuid[])',
            ([row['id'] for row in existing],),
        )
def create_test_db():
    db_config = dict(database='postgres', user='******', host='localhost')
    db = DB(db_config, autocommit=True)
    db.alter('DROP DATABASE IF EXISTS sm_test')
    db.alter('CREATE DATABASE sm_test')
    db.close()

    local('psql -h localhost -U sm sm_test < {}'.format(join(proj_root(), 'scripts/create_schema.sql')))
def fin():
    db_config = dict(database='postgres', user='******', host='localhost', password='******')
    db = DB(db_config, autocommit=True)
    db.alter('DROP DATABASE IF EXISTS sm_test')
    db.close()
def fin():
    db = DB(db_config, autocommit=True)
    try:
        db.alter('DROP DATABASE IF EXISTS sm_test')
    except Exception as e:
        logging.getLogger('engine').warning('Drop sm_test database failed: %s', e)
    finally:
        db.close()
def create_test_db():
    db_config = dict(database='postgres', user='******', host='localhost')
    db = DB(db_config, autocommit=True)
    db.alter('DROP DATABASE IF EXISTS sm_test')
    db.alter('CREATE DATABASE sm_test')
    db.close()

    local('psql -h localhost -U sm sm_test < {}'.format(
        join(proj_root(), 'scripts/create_schema.sql')))
def fill_test_db(create_test_db, drop_test_db):
    db_config = dict(database='sm_test', user='******', host='localhost', password='******')
    db = DB(db_config)
    try:
        db.alter('TRUNCATE dataset CASCADE')
        db.insert("INSERT INTO dataset VALUES (%s, %s, %s, %s, %s)",
                  [(1, 'ds_name', '/ds_path', json.dumps({}), json.dumps({}))])
        db.alter('TRUNCATE coordinates CASCADE')
    finally:
        db.close()
def create_fill_test_db(create_test_db, drop_test_db):
    db_config = dict(database='sm_test', user='******', host='localhost', password='******')
    db = DB(db_config)
    try:
        db.alter('TRUNCATE formula_db CASCADE')
        db.insert('INSERT INTO formula_db VALUES (%s, %s, %s)',
                  [(0, '2016-01-01', 'HMDB')])
        db.insert('INSERT INTO formula VALUES (%s, %s, %s, %s, %s)',
                  [(9, 0, '04138', 'Au', 'Gold')])
        db.insert('INSERT INTO agg_formula VALUES (%s, %s, %s, %s, %s)',
                  [(9, 0, 'Au', ['04138'], ['Gold'])])
        db.alter('TRUNCATE theor_peaks CASCADE')
        db.insert('INSERT INTO theor_peaks VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)',
                  [(0, 9, '+H', 0.01, 1, 10000, [100, 200], [10, 1], [], [])])
    finally:
        db.close()
def migrate_ion_thumbnail(ds_id):
    output.print('Migrating ion thumbnail images')
    with timeit():
        output.print('Transferring images and updating database...')

        db = DB()
        ion_thumb_id, ion_thumbnail_url = db.select_one(SEL_ION_THUMB, params=(ds_id,))
        if not ion_thumbnail_url and ion_thumb_id:
            transfer_images(
                ds_id,
                'ion_thumbnails',
                image_storage.THUMB,
                [ion_thumb_id],
            )
            ion_thumb_url = image_storage.get_image_url(image_storage.THUMB, ds_id, ion_thumb_id)
            db.alter(UPD_ION_THUMB, params=(ion_thumb_url, ds_id))
def update_core_metabolome_database():
    db = DB()
    rows = db.select_with_fields("SELECT * FROM molecular_db WHERE name = 'core_metabolome_v3'")
    if rows:
        moldb = rows[0]
        logger.info(f'Updating molecular database: {moldb}')
        moldb['name'] = 'CoreMetabolome'
        moldb['version'] = 'v3'
        moldb['full_name'] = 'Core Metabolome Database'
        moldb['description'] = 'METASPACE database of core mammalian metabolites and lipids'
        moldb['link'] = 'https://metaspace2020.eu'
        moldb['citation'] = ttdoc(tttext('In preparation'))
        moldb['group_id'] = None
        moldb['is_public'] = True
        db.alter(
            (
                "UPDATE molecular_db "
                "SET name = %s, version = %s, full_name = %s, description = %s,"
                " link = %s, citation = %s, group_id = %s, is_public = %s "
                "WHERE id = %s;"
            ),
            params=(
                moldb['name'],
                moldb['version'],
                moldb['full_name'],
                moldb['description'],
                moldb['link'],
                moldb['citation'],
                moldb['group_id'],
                moldb['is_public'],
                moldb['id'],
            ),
        )

    rows = db.select_with_fields("SELECT * FROM molecular_db WHERE name = 'CoreMetabolome'")
    if rows:
        logger.info(f'Updated database: {rows[0]}')
    else:
        logger.error('Did not find database "CoreMetabolome"')
def del_jobs(ds: Dataset, moldb_ids: Optional[Iterable[int]] = None):
    """
    Delete a dataset's jobs for the specified moldbs, or all jobs if moldb_ids is None.
    Also cleans up the annotations from ElasticSearch and deletes their ion images.
    """
    db = DB()
    es = ESExporter(db)

    if moldb_ids is None:
        moldb_ids = get_ds_moldb_ids(ds.id)
    moldbs = molecular_db.find_by_ids(moldb_ids)

    job_ids = DB().select_onecol(
        'SELECT j.id FROM job j WHERE ds_id = %s AND moldb_id = ANY(%s)',
        (ds.id, list(moldb_ids)),
    )
    del_diagnostics(ds.id, job_ids)

    for moldb in moldbs:
        logger.info(f'Deleting isotopic images: ds_id={ds.id} ds_name={ds.name} moldb={moldb}')
        img_id_rows = db.select_onecol(
            'SELECT iso_image_ids '
            'FROM annotation m '
            'JOIN job j ON j.id = m.job_id '
            'JOIN dataset d ON d.id = j.ds_id '
            'WHERE ds_id = %s AND j.moldb_id = %s',
            (ds.id, moldb.id),
        )
        image_ids = [
            img_id for img_ids in img_id_rows for img_id in img_ids if img_id is not None
        ]
        image_storage.delete_images(image_storage.ISO, ds.id, image_ids)

        logger.info(f"Deleting job results: ds_id={ds.id} ds_name={ds.name} moldb={moldb}")
        db.alter('DELETE FROM job WHERE ds_id = %s and moldb_id = %s', (ds.id, moldb.id))
        es.delete_ds(ds.id, moldb)
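# Hedged usage sketch for del_jobs: `ds` stands for an already-loaded sm.engine Dataset
# instance (its loading is not shown here), and the moldb id is a placeholder.
del_jobs(ds, moldb_ids=[22])  # drop jobs, ion images and ES annotations for one moldb
del_jobs(ds)                  # or drop them for every moldb the dataset was processed with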
def update_public_database_descriptions():
    db = DB()
    public_db_names = db.select(
        'SELECT name FROM molecular_db WHERE is_public = true AND archived = false'
    )
    logger.info(f'Updating public molecular databases: {public_db_names}')

    for (name,) in public_db_names:
        desc = database_descriptions.get(name, None)
        if desc:
            db.alter(
                "UPDATE molecular_db "
                "SET description = %s, full_name = %s, link = %s, citation = %s "
                "WHERE name = %s;",
                params=(
                    desc['description'],
                    desc['full_name'],
                    desc['link'],
                    desc['citation'],
                    name,
                ),
            )
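# Hedged sketch of one database_descriptions entry consumed by the function above; the keys
# mirror the UPDATE statement, and the values (including the ttdoc citation, borrowed from
# update_core_metabolome_database) are placeholders.
database_descriptions = {
    'HMDB-v4': {
        'full_name': 'Human Metabolome Database',
        'description': 'Database of small-molecule metabolites found in the human body',
        'link': 'https://hmdb.ca',
        'citation': ttdoc(tttext('HMDB citation placeholder')),
    },
}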
def ensure_db_populated(sm_config, analysis_version, database):
    db = DB()

    # Install DB schema if needed
    query = "SELECT COUNT(*) FROM pg_tables WHERE schemaname = 'public' AND tablename = 'dataset'"
    tables_exist = db.select_one(query)[0] >= 1
    if not tables_exist:
        print('Installing DB schema')
        db.alter(DB_SQL_SCHEMA)

    # Import HMDB if needed
    moldb = MOL_DBS[database]
    try:
        molecular_db.find_by_name_version(moldb['name'], moldb['version'])
    except SMError:
        print(f'Importing {database}')
        with TemporaryDirectory() as tmp:
            urlretrieve(moldb['url'], f'{tmp}/moldb.tsv')
            molecular_db.create(moldb['name'], moldb['version'], f'{tmp}/moldb.tsv')

    if analysis_version > 1:
        if len(db.select("SELECT name FROM scoring_model WHERE name = 'v3_default'")) == 0:
            print("Importing v3_default scoring model")
            params = upload_catboost_scoring_model(
                model=Path(proj_root())
                / '../scoring-models/v3_default/model-2022-01-05T13-45-26.947188-416b1311.cbm',
                bucket=sm_config['lithops']['lithops']['storage_bucket'],
                prefix='test_scoring_models/v3_default',
                is_public=False,
            )
            save_scoring_model_to_db(name='v3_default', type_='catboost', params=params)
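# Hedged sketch of the MOL_DBS mapping that ensure_db_populated (and SciTester below) reads:
# each entry needs 'name', 'version' and a downloadable TSV 'url'. The values here are
# placeholders, not the project's real configuration.
MOL_DBS = {
    'hmdb': {
        'name': 'HMDB',
        'version': 'v4',
        'url': 'https://example.com/hmdb-v4.tsv',  # placeholder URL
    },
}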
def test_db(sm_config, request):
    db_config = dict(**sm_config['db'])
    db_config['database'] = 'postgres'

    db = DB(db_config, autocommit=True)
    db.alter('DROP DATABASE IF EXISTS sm_test')
    db.alter('CREATE DATABASE sm_test')
    db.close()

    local('psql -h {} -U {} sm_test < {}'.format(
        sm_config['db']['host'],
        sm_config['db']['user'],
        Path(proj_root()) / 'scripts/create_schema.sql'))

    def fin():
        db = DB(db_config, autocommit=True)
        try:
            db.alter('DROP DATABASE IF EXISTS sm_test')
        except Exception as e:
            logging.getLogger('engine').warning('Drop sm_test database failed: %s', e)
        finally:
            db.close()

    request.addfinalizer(fin)
class SearchJob(object):
    """ Main class responsible for molecule search. Uses other modules of the engine.

    Args
    ----
    no_clean : bool
        Don't delete interim data files
    """

    def __init__(self, img_store=None, no_clean=False):
        self.no_clean = no_clean
        self._img_store = img_store

        self._job_id = None
        self._sc = None
        self._db = None
        self._ds = None
        self._ds_reader = None
        self._status_queue = None
        self._fdr = None
        self._wd_manager = None
        self._es = None

        self._sm_config = SMConfig.get_conf()

        logger.debug('Using SM config:\n%s', pformat(self._sm_config))

    def _configure_spark(self):
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self._sm_config['spark'].items():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        if 'aws' in self._sm_config:
            sconf.set("spark.hadoop.fs.s3a.access.key",
                      self._sm_config['aws']['aws_access_key_id'])
            sconf.set("spark.hadoop.fs.s3a.secret.key",
                      self._sm_config['aws']['aws_secret_access_key'])
            sconf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            sconf.set("spark.hadoop.fs.s3a.endpoint",
                      "s3.{}.amazonaws.com".format(self._sm_config['aws']['aws_region']))

        self._sc = SparkContext(master=self._sm_config['spark']['master'],
                                conf=sconf, appName='SM engine')

    def _init_db(self):
        logger.info('Connecting to the DB')
        self._db = DB(self._sm_config['db'])

    def store_job_meta(self, mol_db_id):
        """ Store search job metadata in the database """
        logger.info('Storing job metadata')
        rows = [(mol_db_id, self._ds.id, 'STARTED',
                 datetime.now().strftime('%Y-%m-%d %H:%M:%S'))]
        self._job_id = self._db.insert_return(JOB_INS, rows=rows)[0]

    def _run_annotation_job(self, mol_db):
        try:
            self.store_job_meta(mol_db.id)
            mol_db.set_job_id(self._job_id)

            logger.info("Running new job ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                        self._ds.id, self._ds.name, mol_db.name, mol_db.version)

            target_adducts = self._ds.config['isotope_generation']['adducts']
            self._fdr = FDR(job_id=self._job_id,
                            decoy_sample_size=20,
                            target_adducts=target_adducts,
                            db=self._db)

            isocalc = IsocalcWrapper(self._ds.config['isotope_generation'])
            centroids_gen = IonCentroidsGenerator(sc=self._sc, moldb_name=mol_db.name,
                                                  isocalc=isocalc)
            polarity = self._ds.config['isotope_generation']['charge']['polarity']
            all_adducts = list(set(self._sm_config['defaults']['adducts'][polarity])
                               | set(DECOY_ADDUCTS))
            centroids_gen.generate_if_not_exist(isocalc=isocalc,
                                                sfs=mol_db.sfs,
                                                adducts=all_adducts)
            target_ions = centroids_gen.ions(target_adducts)
            self._fdr.decoy_adducts_selection(target_ions)

            search_alg = MSMBasicSearch(sc=self._sc, ds=self._ds, ds_reader=self._ds_reader,
                                        mol_db=mol_db, centr_gen=centroids_gen,
                                        fdr=self._fdr, ds_config=self._ds.config)
            ion_metrics_df, ion_iso_images = search_alg.search()

            search_results = SearchResults(mol_db.id, self._job_id, search_alg.metrics.keys())
            mask = self._ds_reader.get_2d_sample_area_mask()
            img_store_type = self._ds.get_ion_img_storage_type(self._db)
            search_results.store(ion_metrics_df, ion_iso_images, mask,
                                 self._db, self._img_store, img_store_type)
        except Exception as e:
            self._db.alter(JOB_UPD, params=('FAILED',
                                            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                            self._job_id))
            msg = 'Job failed(ds_id={}, mol_db={}): {}'.format(self._ds.id, mol_db, str(e))
            raise JobFailedError(msg) from e
        else:
            self._export_search_results_to_es(mol_db, isocalc)

    def _export_search_results_to_es(self, mol_db, isocalc):
        try:
            self._es.index_ds(self._ds.id, mol_db, isocalc)
        except Exception as e:
            self._db.alter(JOB_UPD, params=('FAILED',
                                            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                            self._job_id))
            msg = 'Export to ES failed(ds_id={}, mol_db={}): {}'.format(self._ds.id, mol_db, str(e))
            raise ESExportFailedError(msg) from e
        else:
            self._db.alter(JOB_UPD, params=('FINISHED',
                                            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                            self._job_id))

    def _remove_annotation_job(self, mol_db):
        logger.info("Removing job results ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                    self._ds.id, self._ds.name, mol_db.name, mol_db.version)
        self._db.alter('DELETE FROM job WHERE ds_id = %s and db_id = %s',
                       params=(self._ds.id, mol_db.id))
        self._es.delete_ds(self._ds.id, mol_db)

    def _moldb_ids(self):
        moldb_service = MolDBServiceWrapper(self._sm_config['services']['mol_db'])
        completed_moldb_ids = {moldb_service.find_db_by_id(db_id)['id']
                               for (_, db_id) in self._db.select(JOB_ID_MOLDB_ID_SEL,
                                                                 params=(self._ds.id,))}
        new_moldb_ids = {moldb_service.find_db_by_name_version(moldb_name)[0]['id']
                         for moldb_name in self._ds.config['databases']}
        return completed_moldb_ids, new_moldb_ids

    def _save_data_from_raw_ms_file(self):
        ms_file_type_config = SMConfig.get_ms_file_handler(self._wd_manager.local_dir.ms_file_path)
        acq_geometry_factory_module = ms_file_type_config['acq_geometry_factory']
        acq_geometry_factory = getattr(import_module(acq_geometry_factory_module['path']),
                                       acq_geometry_factory_module['name'])
        acq_geometry = acq_geometry_factory(self._wd_manager.local_dir.ms_file_path).create()
        self._ds.save_acq_geometry(self._db, acq_geometry)

        self._ds.save_ion_img_storage_type(self._db, ms_file_type_config['img_storage_type'])

    def run(self, ds):
        """ Entry point of the engine. Molecule search is completed in several steps:
            * Copying input data to the engine work dir
            * Conversion input mass spec files to plain text format. One line - one spectrum data
            * Generation and saving to the database theoretical peaks for all formulas from the molecule database
            * Molecules search. The most compute intensive part. Spark is used to run it in distributed manner.
            * Saving results (isotope images and their metrics of quality for each putative molecule) to the database

        Args
        ----
        ds : sm.engine.dataset_manager.Dataset
        """
        try:
            start = time.time()

            self._init_db()
            self._es = ESExporter(self._db)
            self._ds = ds

            if self._sm_config['rabbitmq']:
                self._status_queue = QueuePublisher(config=self._sm_config['rabbitmq'],
                                                    qdesc=SM_DS_STATUS,
                                                    logger=logger)
            else:
                self._status_queue = None
            ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.STARTED)

            self._wd_manager = WorkDirManager(ds.id)
            self._configure_spark()

            if not self.no_clean:
                self._wd_manager.clean()

            self._ds_reader = DatasetReader(self._ds.input_path, self._sc, self._wd_manager)
            self._ds_reader.copy_convert_input_data()

            self._save_data_from_raw_ms_file()
            self._img_store.storage_type = self._ds.get_ion_img_storage_type(self._db)

            ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.STARTED)

            logger.info('Dataset config:\n%s', pformat(self._ds.config))

            completed_moldb_ids, new_moldb_ids = self._moldb_ids()
            for moldb_id in completed_moldb_ids.symmetric_difference(new_moldb_ids):
                # ignore ids present in both sets
                mol_db = MolecularDB(id=moldb_id, db=self._db,
                                     iso_gen_config=self._ds.config['isotope_generation'])
                if moldb_id not in new_moldb_ids:
                    self._remove_annotation_job(mol_db)
                elif moldb_id not in completed_moldb_ids:
                    self._run_annotation_job(mol_db)

            ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.FINISHED)

            logger.info("All done!")
            time_spent = time.time() - start
            logger.info('Time spent: %d mins %d secs', *divmod(int(round(time_spent)), 60))
        except Exception as e:
            if self._ds:
                ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.FAILED)
            logger.error(e, exc_info=True)
            raise
        finally:
            if self._sc:
                self._sc.stop()
            if self._db:
                self._db.close()
            if self._wd_manager and not self.no_clean:
                self._wd_manager.clean()
            logger.info('*' * 150)
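# Hedged usage sketch for the SearchJob variant above: `img_store` (an image-store client)
# and `ds` (a sm.engine.dataset_manager.Dataset, per the run() docstring) are placeholders
# created elsewhere in the engine.
job = SearchJob(img_store=img_store, no_clean=True)
job.run(ds)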
class SciTester:
    def __init__(self, sm_config, analysis_version, database):
        reports_path = Path(proj_root()) / 'tests/reports'
        timestamp = datetime.now().replace(microsecond=0).isoformat().replace(':', '-')
        suffix = f'{database}-v{analysis_version}'

        self.sm_config = sm_config
        self.db = DB()

        self.ds_id = '2000-01-01_00h00m01s'
        self.ref_results_path = reports_path / f'spheroid-{suffix}.csv'
        self.output_results_path = reports_path / f'test-{suffix}-{timestamp}.csv'

        self.ds_name = 'sci_test_spheroid_untreated'
        self.ds_data_path = join(self.sm_config['fs']['spark_data_path'], self.ds_name)
        self.moldb = MOL_DBS[database]
        self.analysis_version = analysis_version
        self.input_path = join(proj_root(), 'tests/data/untreated')
        self.ds_config_path = join(self.input_path, 'config.json')
        self.metrics = ['chaos', 'spatial', 'spectral', 'mz_err_abs', 'mz_err_rel', 'msm', 'fdr']

        self.comparison_df = None

    def fetch_search_res_df(self):
        query = ("SELECT m.formula, m.adduct, m.msm, m.fdr, m.stats "
                 "FROM annotation m "
                 "JOIN job j ON j.id = m.job_id "
                 "WHERE j.ds_id = %s "
                 "ORDER BY formula, adduct ")
        rows = self.db.select_with_fields(query, params=(self.ds_id,))
        return pd.DataFrame([
            {
                'formula': r['formula'],
                'adduct': r['adduct'],
                'msm': r['msm'],
                'fdr': r['fdr'],
                **r['stats'],
            }
            for r in rows
        ])

    def save_reference_results(self):
        results_df = self.fetch_search_res_df()
        cols = ['formula', 'adduct', *self.metrics]
        results_df[cols].to_csv(self.ref_results_path, index=False)
        print(f'Successfully saved reference search results to {self.ref_results_path}')

    def save_comparison_results(self):
        self.comparison_df.to_csv(self.output_results_path, index=False)

    @staticmethod
    def print_metric_hist(metric_vals):
        if 0.2 < np.max(metric_vals) - np.min(metric_vals) <= 3.0:
            # For metrics in the range -1.0 to 1.0, aligned bins of 0.1 are easier to read
            min_edge = np.floor(np.min(metric_vals) * 10) / 10
            max_edge = np.ceil(np.max(metric_vals) * 10) / 10
            n_bins = int(np.round((max_edge - min_edge) * 10))
        else:
            # Otherwise use unaligned bins
            min_edge = np.min(metric_vals)
            max_edge = np.max(metric_vals)
            n_bins = 10
        bins = np.linspace(min_edge, max_edge, n_bins + 1)
        metric_freq, metric_interv = np.histogram(metric_vals, bins=bins)

        for lo, hi, freq in zip(metric_interv[:-1], metric_interv[1:], metric_freq):
            print(f'{lo:f}-{hi:f}: {freq}')

    def print_differences(self):
        df = self.comparison_df
        missing_df = df[df.matching == 'ref_only']
        unexpected_df = df[df.matching == 'new_only']
        common_df = df[df.matching == '']
        # Count the rows that belong to the reference/new result sets (isin(...).sum()
        # counts matches; .count() would count all rows regardless of the condition)
        n_ref = df.matching.isin({'ref_only', ''}).sum()
        n_new = df.matching.isin({'new_only', ''}).sum()

        print(f'MISSED FORMULAS: {len(missing_df)} ({len(missing_df) * 100 / n_ref:.1f}%)')
        print(f'FALSE DISCOVERY: {len(unexpected_df)} ({len(unexpected_df) * 100 / n_new:.1f}%)')

        differing_metrics = [
            metric for metric in self.metrics if common_df[f'{metric}_differs'].any()
        ]
        if differing_metrics:
            for metric in differing_metrics:
                print(f'{metric}_new - {metric}_ref histogram: ')
                self.print_metric_hist(common_df[f'{metric}_new'] - common_df[f'{metric}_ref'])
                print()
        else:
            print('All metrics equal in common annotations')

    def fdr_differs(self, fdr_ref, fdr_new):
        if self.analysis_version == 1:
            # FDRs are quantized - allow them to jump up/down one level
            levels = [0.0501, 0.1001, 0.2001, 0.5001]
            ref_level = next((i for i, level in enumerate(levels) if fdr_ref < level), len(levels))
            new_level = next((i for i, level in enumerate(levels) if fdr_new < level), len(levels))
            return abs(ref_level - new_level) > 1
        else:
            # Allow +/- 10% relative difference OR +/- 5% FDR absolute difference to compensate
            # for possible differences if the decoys are sampled differently.
            return not np.isclose(fdr_ref, fdr_new, rtol=0.1, atol=0.05)

    def make_comparison_df(self):
        ref_results = pd.read_csv(self.ref_results_path)
        new_results = self.fetch_search_res_df()

        df = ref_results.merge(
            new_results,
            on=['formula', 'adduct'],
            how='outer',
            suffixes=('_ref', '_new'),
            indicator='matching',
        )
        df['matching'] = df.matching.cat.rename_categories(
            {'left_only': 'ref_only', 'right_only': 'new_only', 'both': ''}
        )

        # Interleave columns for easy side-by-side comparison
        cols = ['formula', 'adduct', 'matching']
        for col in self.metrics:
            cols.append(f'{col}_ref')
            cols.append(f'{col}_new')
        df = df[cols]

        # Add "differs" fields indicating whether the values have changed enough to be considered
        # different from the originals.
        for col in self.metrics:
            if col == 'fdr':
                df['fdr_differs'] = [
                    self.fdr_differs(row.fdr_ref, row.fdr_new)
                    for row in df[['fdr_ref', 'fdr_new']].itertuples()
                ]
            else:
                df[f'{col}_differs'] = ~np.isclose(df[f'{col}_ref'], df[f'{col}_new'])

        self.comparison_df = df

    def search_results_are_different(self):
        annotations_mismatch = (self.comparison_df.matching != '').any()
        metrics_differ = any(
            self.comparison_df[f'{metric}_differs'].any() for metric in self.metrics
        )
        return annotations_mismatch or metrics_differ

    @classmethod
    def _patch_image_storage(cls):
        class ImageStorageMock:
            ISO = image_storage.ISO

            def __init__(self, *args, **kwargs):
                pass

            def post_image(self, *args, **kwargs):
                pass

        from sm.engine.annotation_spark import search_results

        search_results.ImageStorage = ImageStorageMock

    def run_search(self, store_images=False, use_lithops=False):
        if not store_images:
            self._patch_image_storage()

        moldb_id = molecular_db.find_by_name_version(self.moldb['name'], self.moldb['version']).id

        os.environ['PYSPARK_PYTHON'] = sys.executable
        ds = create_ds_from_files(self.ds_id, self.ds_name, self.input_path)
        ds.config['analysis_version'] = self.analysis_version
        ds.config['fdr']['scoring_model'] = 'v3_default' if self.analysis_version > 1 else None
        ds.config['database_ids'] = [moldb_id]

        self.db.alter('DELETE FROM job WHERE ds_id=%s', params=(ds.id,))
        ds.save(self.db, allow_insert=True)
        perf = NullProfiler()

        if use_lithops:
            # Override the runtime to force it to run without docker.
            lithops_executor.RUNTIME_CF_VPC = 'python'
            lithops_executor.RUNTIME_CE = 'python'
            executor = Executor(self.sm_config['lithops'], perf)
            job = ServerAnnotationJob(
                executor,
                ds,
                perf,
                self.sm_config,
                store_images=store_images,
            )
            job.run(debug_validate=True)
        else:
            AnnotationJob(ds, perf).run()

        self.make_comparison_df()

    def clear_data_dirs(self):
        path = Path(self.ds_data_path)
        if path.exists():
            path.rmdir()
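# Hedged usage sketch for SciTester, using only the methods defined above; sm_config is
# assumed to be the loaded SM config, and the analysis version and database key are
# placeholders normally supplied by the CLI wrapper around this class.
sci_tester = SciTester(sm_config, analysis_version=3, database='hmdb')
sci_tester.run_search(store_images=False, use_lithops=False)
if sci_tester.search_results_are_different():
    sci_tester.print_differences()
    sci_tester.save_comparison_results()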
PUBLIC_DATABASE_NAMES = (
    'BraChemDB-2018-01',
    'ChEBI-2018-01',
    'ECMDB-2018-12',
    'HMDB-v4',
    'HMDB-v4-cotton',
    'HMDB-v4-endogenous',
    'LipidMaps-2017-12-12',
    'PAMDB-v1.0',
    'SwissLipids-2018-02-02',
    'HMDB',
    'ChEBI',
    'LIPID_MAPS',
    'SwissLipids',
    'COTTON_HMDB',
    'HMDB-v2.5',
    'HMDB-v2.5-cotton',
)
db.alter(PUBLIC_MOLDB_UPD, params=(PUBLIC_DATABASE_NAMES,))

ARCHIVED_MOLDB_UPD = 'UPDATE molecular_db SET archived = true WHERE name IN %s'
ARCHIVED_DATABASE_NAMES = (
    'HMDB',
    'ChEBI',
    'LIPID_MAPS',
    'SwissLipids',
    'COTTON_HMDB',
    'HMDB-v2.5',
    'HMDB-v2.5-cotton',
)
db.alter(ARCHIVED_MOLDB_UPD, params=(ARCHIVED_DATABASE_NAMES,))
import argparse
import logging

import pandas as pd

from sm.engine.db import DB, ConnectionPool
from sm.engine.config import init_loggers

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Update molecular database molecule names')
    parser.add_argument('--config', default='conf/config.json', help='SM config path')
    parser.add_argument('file_path', help='Path to file with new names')
    args = parser.parse_args()

    init_loggers()
    logger = logging.getLogger('engine')

    logger.info(f'Importing new names from {args.file_path}')

    db_config = {"host": "localhost", "database": "mol_db", "user": "******"}
    with ConnectionPool(db_config):
        db = DB()
        names_df = pd.read_csv(args.file_path, sep='\t')[['id', 'name']]
        sql = (
            'WITH molecule_name AS (SELECT UNNEST(%s::text[]) as id_, UNNEST(%s::text[]) as name_) '
            'UPDATE molecule SET mol_name = molecule_name.name_ '
            'FROM molecule_name WHERE molecule.mol_id = molecule_name.id_'
        )
        db.alter(sql, [names_df.id.values.tolist(), names_df.name.values.tolist()])
def add_diagnostics(diagnostics: List[DatasetDiagnostic]):
    """Upserts dataset diagnostics, overwriting existing values with the same ds_id, job_id, type"""
    # Validate input, as postgres can't enforce the JSON columns have the correct schema,
    # and many places (graphql, python client, etc.) rely on these structures.
    if not diagnostics:
        return
    for diagnostic in diagnostics:
        assert 'ds_id' in diagnostic
        assert 'type' in diagnostic
        images = diagnostic.get('images', [])
        assert all(image['key'] in DiagnosticImageKey for image in images)
        assert all(image['format'] in DiagnosticImageFormat for image in images)
        assert all(image['image_id'] in image['url'] for image in images)
        image_keys = set((image.get('key'), image.get('index')) for image in images)
        assert len(image_keys) == len(images), 'diagnostic image keys should be unique'

    db = DB()
    # Find all diagnostics that should be replaced by the new diagnostics
    existing = db.select_with_fields(
        """
        WITH new_diagnostic AS (
            SELECT UNNEST(%s::text[]) as ds_id, UNNEST(%s::int[]) as job_id, UNNEST(%s::text[]) as type
        )
        SELECT dd.ds_id, dd.id, dd.images
        FROM new_diagnostic nd
        JOIN dataset_diagnostic dd ON nd.ds_id = dd.ds_id
            AND (nd.job_id = dd.job_id OR (nd.job_id IS NULL AND dd.job_id is NULL))
            AND nd.type = dd.type
        """,
        list(map(list, zip(*((d['ds_id'], d.get('job_id'), d['type']) for d in diagnostics)))),
    )

    if existing:
        logger.debug(
            f'Deleting {len(existing)} existing diagnostics for dataset {existing[0]["ds_id"]}'
        )
        # Delete existing images
        image_ids_by_ds = defaultdict(list)
        for row in existing:
            for img in row['images'] or []:
                image_ids_by_ds[row['ds_id']].append(img['image_id'])
        for ds_id, image_ids in image_ids_by_ds.items():
            image_storage.delete_images(image_storage.DIAG, ds_id, image_ids)

        # Delete existing DB rows
        db.alter(
            'DELETE FROM dataset_diagnostic WHERE id = ANY(%s::uuid[])',
            ([row['id'] for row in existing],),
        )

    logger.debug(f'Inserting {len(diagnostics)} diagnostics for dataset {diagnostics[0]["ds_id"]}')
    db.insert(
        'INSERT INTO dataset_diagnostic (ds_id, job_id, type, updated_dt, data, error, images) '
        'VALUES (%s, %s, %s, %s, %s, %s, %s)',
        [
            (
                d['ds_id'],
                d.get('job_id'),
                d['type'],
                datetime.now(),
                numpy_json_dumps(d['data']) if d.get('data') is not None else None,
                d.get('error'),
                numpy_json_dumps(d.get('images', [])),
            )
            for d in diagnostics
        ],
    )
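# Hedged sketch of a diagnostic entry that satisfies the assertions in add_diagnostics above.
# The 'TIC' type and the image key/format strings are placeholders; real values must be
# members of DiagnosticImageKey / DiagnosticImageFormat, and 'url' must contain 'image_id'.
diagnostic = {
    'ds_id': '2000-01-01_00h00m01s',   # placeholder dataset id
    'job_id': 42,                      # optional
    'type': 'TIC',                     # placeholder type name
    'data': {'min': 0.0, 'max': 1.0},  # arbitrary JSON-serializable payload
    'images': [
        {
            'key': 'TIC_IMAGE',        # placeholder DiagnosticImageKey value
            'index': 0,
            'format': 'PNG',           # placeholder DiagnosticImageFormat value
            'image_id': 'abc123',
            'url': 'https://storage.example/diag/2000-01-01_00h00m01s/abc123',
        }
    ],
}
add_diagnostics([diagnostic])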
class SearchJob(object):
    """ Main class responsible for molecule search. Uses other modules of the engine.

    Args
    ----
    no_clean : bool
        Don't delete interim data files
    """

    def __init__(self, img_store=None, no_clean=False):
        self.no_clean = no_clean
        self._img_store = img_store

        self._job_id = None
        self._sc = None
        self._db = None
        self._ds = None
        self._ds_reader = None
        self._status_queue = None
        self._fdr = None
        self._wd_manager = None
        self._es = None

        self._sm_config = SMConfig.get_conf()

        logger.debug('Using SM config:\n%s', pformat(self._sm_config))

    def _configure_spark(self):
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self._sm_config['spark'].items():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        if 'aws' in self._sm_config:
            sconf.set("spark.hadoop.fs.s3a.access.key",
                      self._sm_config['aws']['aws_access_key_id'])
            sconf.set("spark.hadoop.fs.s3a.secret.key",
                      self._sm_config['aws']['aws_secret_access_key'])
            sconf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            sconf.set("spark.hadoop.fs.s3a.endpoint",
                      "s3.{}.amazonaws.com".format(self._sm_config['aws']['aws_region']))

        self._sc = SparkContext(master=self._sm_config['spark']['master'],
                                conf=sconf, appName='SM engine')

    def _init_db(self):
        logger.info('Connecting to the DB')
        self._db = DB(self._sm_config['db'])

    def store_job_meta(self, mol_db_id):
        """ Store search job metadata in the database """
        logger.info('Storing job metadata')
        rows = [(mol_db_id, self._ds.id, JobStatus.RUNNING,
                 datetime.now().strftime('%Y-%m-%d %H:%M:%S'))]
        self._job_id = self._db.insert_return(JOB_INS, rows=rows)[0]

    def _run_annotation_job(self, mol_db):
        try:
            self.store_job_meta(mol_db.id)
            mol_db.set_job_id(self._job_id)

            logger.info("Running new job ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                        self._ds.id, self._ds.name, mol_db.name, mol_db.version)

            target_adducts = self._ds.config['isotope_generation']['adducts']
            self._fdr = FDR(job_id=self._job_id,
                            decoy_sample_size=20,
                            target_adducts=target_adducts,
                            db=self._db)

            isocalc = IsocalcWrapper(self._ds.config['isotope_generation'])
            centroids_gen = IonCentroidsGenerator(sc=self._sc,
                                                  moldb_name=mol_db.name,
                                                  isocalc=isocalc)
            polarity = self._ds.config['isotope_generation']['charge']['polarity']
            all_adducts = list(set(self._sm_config['defaults']['adducts'][polarity])
                               | set(DECOY_ADDUCTS))
            centroids_gen.generate_if_not_exist(isocalc=isocalc,
                                                sfs=mol_db.sfs,
                                                adducts=all_adducts)
            target_ions = centroids_gen.ions(target_adducts)
            self._fdr.decoy_adducts_selection(target_ions)

            search_alg = MSMBasicSearch(sc=self._sc, ds=self._ds, ds_reader=self._ds_reader,
                                        mol_db=mol_db, centr_gen=centroids_gen,
                                        fdr=self._fdr, ds_config=self._ds.config)
            ion_metrics_df, ion_iso_images = search_alg.search()

            search_results = SearchResults(mol_db.id, self._job_id, search_alg.metrics.keys())
            mask = self._ds_reader.get_2d_sample_area_mask()
            img_store_type = self._ds.get_ion_img_storage_type(self._db)
            search_results.store(ion_metrics_df, ion_iso_images, mask,
                                 self._db, self._img_store, img_store_type)
        except Exception as e:
            self._db.alter(JOB_UPD_STATUS_FINISH,
                           params=(JobStatus.FAILED,
                                   datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                   self._job_id))
            msg = 'Job failed(ds_id={}, mol_db={}): {}'.format(self._ds.id, mol_db, str(e))
            raise JobFailedError(msg) from e
        else:
            self._db.alter(JOB_UPD_STATUS_FINISH,
                           params=(JobStatus.FINISHED,
                                   datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                   self._job_id))

    def _remove_annotation_job(self, mol_db):
        logger.info("Removing job results ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                    self._ds.id, self._ds.name, mol_db.name, mol_db.version)
        self._db.alter('DELETE FROM job WHERE ds_id = %s and db_id = %s',
                       params=(self._ds.id, mol_db.id))
        self._es.delete_ds(self._ds.id, mol_db)

    def _moldb_ids(self):
        completed_moldb_ids = {
            db_id
            for (_, db_id) in self._db.select(JOB_ID_MOLDB_ID_SEL, params=(self._ds.id,))
        }
        new_moldb_ids = {
            MolecularDB(name=moldb_name).id
            for moldb_name in self._ds.config['databases']
        }
        return completed_moldb_ids, new_moldb_ids

    def _save_data_from_raw_ms_file(self):
        ms_file_type_config = SMConfig.get_ms_file_handler(self._wd_manager.local_dir.ms_file_path)
        acq_geometry_factory_module = ms_file_type_config['acq_geometry_factory']
        acq_geometry_factory = getattr(import_module(acq_geometry_factory_module['path']),
                                       acq_geometry_factory_module['name'])
        acq_geometry = acq_geometry_factory(self._wd_manager.local_dir.ms_file_path).create()
        self._ds.save_acq_geometry(self._db, acq_geometry)

        self._ds.save_ion_img_storage_type(self._db, ms_file_type_config['img_storage_type'])

    def run(self, ds):
        """ Entry point of the engine. Molecule search is completed in several steps:
            * Copying input data to the engine work dir
            * Conversion input mass spec files to plain text format. One line - one spectrum data
            * Generation and saving to the database theoretical peaks for all formulas from the molecule database
            * Molecules search. The most compute intensive part. Spark is used to run it in distributed manner.
            * Saving results (isotope images and their metrics of quality for each putative molecule) to the database

        Args
        ----
        ds : sm.engine.dataset_manager.Dataset
        """
        try:
            logger.info('*' * 150)
            start = time.time()

            self._init_db()
            self._es = ESExporter(self._db)
            self._ds = ds

            if self._sm_config['rabbitmq']:
                self._status_queue = QueuePublisher(config=self._sm_config['rabbitmq'],
                                                    qdesc=SM_DS_STATUS,
                                                    logger=logger)
            else:
                self._status_queue = None

            self._wd_manager = WorkDirManager(ds.id)
            self._configure_spark()

            if not self.no_clean:
                self._wd_manager.clean()

            self._ds_reader = DatasetReader(self._ds.input_path, self._sc, self._wd_manager)
            self._ds_reader.copy_convert_input_data()

            self._save_data_from_raw_ms_file()
            self._img_store.storage_type = self._ds.get_ion_img_storage_type(self._db)

            logger.info('Dataset config:\n%s', pformat(self._ds.config))

            completed_moldb_ids, new_moldb_ids = self._moldb_ids()
            for moldb_id in completed_moldb_ids.symmetric_difference(new_moldb_ids):
                # ignore ids present in both sets
                mol_db = MolecularDB(id=moldb_id, db=self._db,
                                     iso_gen_config=self._ds.config['isotope_generation'])
                if moldb_id not in new_moldb_ids:
                    self._remove_annotation_job(mol_db)
                elif moldb_id not in completed_moldb_ids:
                    self._run_annotation_job(mol_db)

            logger.info("All done!")
            time_spent = time.time() - start
            logger.info('Time spent: %d mins %d secs', *divmod(int(round(time_spent)), 60))
        finally:
            if self._sc:
                self._sc.stop()
            if self._db:
                self._db.close()
            if self._wd_manager and not self.no_clean:
                self._wd_manager.clean()
            logger.info('*' * 150)
class SearchJob(object):
    """ Main class responsible for molecule search. Uses other modules of the engine.

    Args
    ----------
    ds_name : string
        A dataset short name
    """

    def __init__(self, client_email, ds_name):
        self.sm_config = SMConfig.get_conf()
        self.client_email = client_email
        self.ds_name = ds_name
        self.ds_id = None
        self.job_id = None
        self.sc = None
        self.db = None
        self.ds = None
        self.fdr = None
        self.formulas = None
        self.ds_config = None
        self.wd_manager = None

    def _read_ds_config(self):
        with open(self.wd_manager.ds_config_path) as f:
            self.ds_config = json.load(f)

    def _configure_spark(self):
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self.sm_config['spark'].iteritems():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        if 'aws' in self.sm_config:
            sconf.set("spark.hadoop.fs.s3a.access.key", self.sm_config['aws']['aws_access_key_id'])
            sconf.set("spark.hadoop.fs.s3a.secret.key", self.sm_config['aws']['aws_secret_access_key'])
            sconf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

        # sconf.set("spark.python.profile", "true")
        self.sc = SparkContext(master=self.sm_config['spark']['master'],
                               conf=sconf, appName='SM engine')
        if not self.sm_config['spark']['master'].startswith('local'):
            self.sc.addPyFile(join(local_path(proj_root()), 'sm.zip'))

    def _init_db(self):
        logger.info('Connecting to the DB')
        self.db = DB(self.sm_config['db'])
        self.sf_db_id = self.db.select_one(DB_ID_SEL, self.ds_config['database']['name'])[0]

    def store_job_meta(self):
        """ Store search job metadata in the database """
        logger.info('Storing job metadata')
        self.ds_id = int(self.db.select_one(DS_ID_SEL, self.ds_name)[0])
        self.job_id = self.ds_id
        self.db.alter(DEL_JOB_SQL, self.job_id)
        rows = [(self.job_id, self.sf_db_id, self.ds_id,
                 datetime.now().strftime('%Y-%m-%d %H:%M:%S'))]
        self.db.insert(JOB_INS, rows)

        rows = [(self.job_id, adduct)
                for adduct in self.ds_config['isotope_generation']['adducts']]
        self.db.insert(ADDUCT_INS, rows)

    def run(self, input_path, ds_config_path, clean=False):
        """ Entry point of the engine. Molecule search is completed in several steps:
            * Copying input data to the engine work dir
            * Conversion input data (imzML+ibd) to plain text format. One line - one spectrum data
            * Generation and saving to the database theoretical peaks for all formulas from the molecule database
            * Molecules search. The most compute intensive part. Spark is used to run it in distributed manner.
            * Saving results (isotope images and their metrics of quality for each putative molecule) to the database

        Args
        -------
        input_path : string
            Path to the dataset folder with .imzML and .ibd files
        ds_config_path: string
            Path to the dataset config file
        clean : bool
            Clean all interim data files before starting molecule search
        """
        try:
            self.wd_manager = WorkDirManager(self.ds_name)
            if clean:
                self.wd_manager.clean()

            self.wd_manager.copy_input_data(input_path, ds_config_path)

            self._read_ds_config()
            logger.info('Dataset config:\n%s', pformat(self.ds_config))

            self._configure_spark()
            self._init_db()

            if not self.wd_manager.exists(self.wd_manager.txt_path):
                imzml_converter = ImzmlTxtConverter(self.ds_name,
                                                    self.wd_manager.local_dir.imzml_path,
                                                    self.wd_manager.local_dir.txt_path,
                                                    self.wd_manager.local_dir.coord_path)
                imzml_converter.convert()

                if not self.wd_manager.local_fs_only:
                    self.wd_manager.upload_to_remote()

            self.ds = Dataset(self.sc, self.ds_name, self.client_email, input_path,
                              self.ds_config, self.wd_manager, self.db)
            self.ds.save_ds_meta()

            self.store_job_meta()

            theor_peaks_gen = TheorPeaksGenerator(self.sc, self.sm_config, self.ds_config)
            theor_peaks_gen.run()

            target_adducts = self.ds_config['isotope_generation']['adducts']
            self.fdr = FDR(self.job_id, self.sf_db_id,
                           decoy_sample_size=20,
                           target_adducts=target_adducts,
                           db=self.db)
            self.fdr.decoy_adduct_selection()
            self.formulas = FormulasSegm(self.job_id, self.sf_db_id, self.ds_config, self.db)

            # search_alg = MSMBasicSearch(self.sc, self.ds, self.formulas, self.fdr, self.ds_config)
            search_alg = MSMExtraFeats(self.sc, self.ds, self.formulas, self.fdr, self.ds_config)
            sf_metrics_df, sf_iso_images = search_alg.search()

            search_results = SearchResults(self.sf_db_id, self.ds_id, self.job_id,
                                           self.ds_name, self.formulas.get_sf_adduct_peaksn(),
                                           self.db, self.sm_config, self.ds_config)
            search_results.sf_metrics_df = sf_metrics_df
            search_results.sf_iso_images = sf_iso_images
            search_results.metrics = search_alg.metrics
            search_results.nrows, search_results.ncols = self.ds.get_dims()
            search_results.store()

            es = ESExporter(self.sm_config)
            es.index_ds(self.db, self.ds_name, self.ds_config['database']['name'])
        except Exception:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            logger.error('\n'.join(traceback.format_exception(exc_type, exc_value, exc_traceback)))
        finally:
            if self.sc:
                # self.sc.show_profiles()
                self.sc.stop()
            if self.db:
                self.db.close()
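# Hedged usage sketch matching this older SearchJob variant's constructor and
# run(input_path, ds_config_path, clean) signature; the e-mail address and paths are
# placeholders.
job = SearchJob('user@example.com', 'sci_test_spheroid_untreated')
job.run('/data/untreated', '/data/untreated/config.json', clean=True)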