def on_startup(config_path: str) -> Dict:
    from sm.engine import image_storage  # pylint: disable=import-outside-toplevel,cyclic-import

    SMConfig.set_path(config_path)
    sm_config = SMConfig.get_conf()
    init_loggers(sm_config['logs'])
    if 'aws' in sm_config:
        populate_aws_env_vars(sm_config['aws'])
    image_storage.init(sm_config)
    return sm_config
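# Usage sketch (illustrative, not part of the original module): typical boot-time wiring.
# 'conf/config.json' matches the default used by the CLI scripts below, but the actual
# path is deployment-specific.
def _example_on_startup():
    sm_config = on_startup('conf/config.json')
    return sm_config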
def read_moldb_file(file_path):
    try:
        if re.findall(r'^s3a?://', file_path):
            bucket_name, key = split_s3_path(file_path)
            sm_config = SMConfig.get_conf()
            buffer = get_s3_bucket(bucket_name, sm_config).Object(key).get()['Body']
        else:
            buffer = Path(file_path).open()
        moldb_df = pd.read_csv(buffer, sep='\t', dtype=object, na_filter=False)
    except ValueError as e:
        raise MalformedCSV(f'Malformed CSV: {e}') from e

    if moldb_df.empty:
        raise MalformedCSV('No data rows found')

    required_columns = {'id', 'name', 'formula'}
    if not required_columns.issubset(set(moldb_df.columns)):
        raise MalformedCSV(
            f'Missing columns. Provided: {moldb_df.columns.to_list()} Required: {required_columns}'
        )

    parsing_errors = _validate_moldb_df(moldb_df)
    if parsing_errors:
        raise BadData('Failed to parse some rows', *parsing_errors)

    moldb_df.rename({'id': 'mol_id', 'name': 'mol_name'}, axis='columns', inplace=True)
    return moldb_df
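# Usage sketch (illustrative): read_moldb_file expects a tab-separated file with at least
# 'id', 'name' and 'formula' columns. The row below is a made-up example and still has to
# pass _validate_moldb_df.
def _example_read_moldb():
    import os
    from tempfile import NamedTemporaryFile

    with NamedTemporaryFile('w', suffix='.tsv', delete=False) as f:
        f.write('id\tname\tformula\n')
        f.write('HMDB0000122\tGlucose\tC6H12O6\n')
    moldb_df = read_moldb_file(f.name)  # 'id'/'name' are renamed to 'mol_id'/'mol_name'
    os.unlink(f.name)
    return moldb_df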
def __init__(
    self,
    imzml_cobject: CloudObject,
    ibd_cobject: CloudObject,
    moldbs: List[InputMolDb],
    ds_config: DSConfig,
    executor: Optional[Executor] = None,
    lithops_config=None,
    cache_key=None,
    use_db_cache=True,
    use_db_mutex=True,
):
    lithops_config = lithops_config or SMConfig.get_conf()['lithops']
    self.lithops_config = lithops_config
    self._db = DB()
    self.imzml_cobject = imzml_cobject
    self.ibd_cobject = ibd_cobject
    self.moldbs = moldbs
    self.ds_config = ds_config
    self.isocalc_wrapper = IsocalcWrapper(ds_config)

    self.executor = executor or Executor(lithops_config)
    self.storage = self.executor.storage

    if cache_key is not None:
        self.cacher: Optional[PipelineCacher] = PipelineCacher(
            self.storage, cache_key, lithops_config
        )
    else:
        self.cacher = None

    self.use_db_cache = use_db_cache
    self.use_db_mutex = use_db_mutex
    self.ds_segm_size_mb = 128
def __init__(self, manager, lit_qdesc, annot_qdesc, upd_qdesc):
    self._sm_config = SMConfig.get_conf()
    self._stopped = False
    self._manager = manager
    self._lithops_queue_cons = QueueConsumer(
        config=self._sm_config['rabbitmq'],
        qdesc=lit_qdesc,
        logger=self.logger,
        poll_interval=1,
        callback=self._callback,
        on_success=self._on_success,
        on_failure=self._on_failure,
    )
    self._lithops_queue_pub = QueuePublisher(
        config=self._sm_config['rabbitmq'], qdesc=lit_qdesc, logger=self.logger
    )
    self._annot_queue_pub = QueuePublisher(
        config=self._sm_config['rabbitmq'], qdesc=annot_qdesc, logger=self.logger
    )
    self._update_queue_pub = QueuePublisher(
        config=self._sm_config['rabbitmq'], qdesc=upd_qdesc, logger=self.logger
    )
def reprocess_dataset_local(
    sm_src, src_ds_id, dst_ds_id, update_metadata_func, skip_existing=True, use_cache=False
):
    existing = get_dataset_diagnostics(dst_ds_id)
    if skip_existing and existing:
        print(f'Skipping {dst_ds_id}\n')
        return dst_ds_id, None

    smds = sm_src.dataset(id=src_ds_id)
    db = DB()
    ds_metadata, ds_config = update_metadata_func(smds.metadata, smds.config)
    ds = Dataset(
        id=dst_ds_id,
        name=smds.name,
        input_path=smds.s3dir,
        upload_dt=datetime.now(),
        metadata=ds_metadata,
        config=ds_config,
        status=DatasetStatus.QUEUED,
        status_update_dt=None,
        is_public=False,
    )
    ds.save(db, None, True)
    with perf_profile(db, 'annotate_lithops', dst_ds_id) as perf:
        executor = Executor(SMConfig.get_conf()['lithops'], perf=perf)
        job = ServerAnnotationJob(executor, ds, perf, use_cache=use_cache)
        job.pipe.use_db_cache = False
        job.run()
    return dst_ds_id
def main():
    parser = argparse.ArgumentParser(description='Merge mol_dbs and adducts into config')
    parser.add_argument('--config', default='conf/config.json', help='SM config path')
    args = parser.parse_args()

    SMConfig.set_path(args.config)
    init_loggers(SMConfig.get_conf()['logs'])
    conf = SMConfig.get_conf()
    with ConnectionPool(conf['db']):
        db = DB()
        populate_ion_formula(db)
        populate_ions(db)
        populate_ion_id(db)
def _save_data_from_raw_ms_file(self, imzml_reader: FSImzMLReader):
    ms_file_path = imzml_reader.filename
    ms_file_type_config = SMConfig.get_ms_file_handler(ms_file_path)
    dims = (imzml_reader.h, imzml_reader.w)
    acq_geometry = make_acq_geometry(
        ms_file_type_config['type'], ms_file_path, self._ds.metadata, dims
    )
    self._ds.save_acq_geometry(self._db, acq_geometry)
def __init__(self, db, sm_config=None):
    self.sm_config = sm_config or SMConfig.get_conf()
    self._es: Elasticsearch = init_es_conn(self.sm_config['elasticsearch'])
    self._ingest: IngestClient = IngestClient(self._es)
    self._db = db
    self._ds_locker = DBMutex(self.sm_config['db'])
    self.index = self.sm_config['elasticsearch']['index']
    self._get_mol_by_formula_dict_cache = dict()
def __init__(
    self,
    imzml_file: str,
    ibd_file: str,
    moldb_files: Union[List[int], List[str]],
    ds_config: DSConfig,
    sm_config: Optional[Dict] = None,
    use_cache=True,
    out_dir: Optional[str] = None,
    executor: Optional[Executor] = None,
):
    sm_config = sm_config or SMConfig.get_conf()
    self.storage = Storage(config=sm_config['lithops'])
    sm_storage = sm_config['lithops']['sm_storage']
    self.imzml_cobj = _upload_if_needed(
        imzml_file, self.storage, sm_storage, 'imzml', use_db_mutex=False
    )
    self.ibd_cobj = _upload_if_needed(
        ibd_file, self.storage, sm_storage, 'imzml', use_db_mutex=False
    )
    if isinstance(moldb_files[0], int):
        self.moldb_defs = _upload_moldbs_from_db(moldb_files, self.storage, sm_storage)
    else:
        self.moldb_defs = _upload_moldbs_from_files(moldb_files, self.storage, sm_storage)
    self.ds_config = ds_config
    self.out_dir = Path(out_dir) if out_dir else Path('./result_pngs')

    if use_cache:
        cache_key: Optional[str] = jsonhash(
            {'imzml': imzml_file, 'ibd': ibd_file, 'dbs': moldb_files, 'ds': ds_config}
        )
    else:
        cache_key = None

    self.pipe = Pipeline(
        self.imzml_cobj,
        self.ibd_cobj,
        self.moldb_defs,
        self.ds_config,
        executor=executor,
        cache_key=cache_key,
        use_db_cache=use_cache,
        use_db_mutex=False,
        lithops_config=sm_config['lithops'],
    )
def store_images_to_s3(
    executor: Executor,
    ds_id: str,
    formula_i_to_db_id: pd.Series,
    png_cobjs: List[CObj[List[Tuple[int, bytes]]]],
) -> DbFormulaImagesDict:
    """
    Upload PNG isotopic images to S3 image storage.

    Images may be uploaded multiple times if a formula_i is in multiple databases
    (i.e. there are duplicates in the formula_i_to_db_id index). This is intentional,
    as there's no check for reused images when deleting individual dataset jobs,
    e.g. by removing a moldb without reprocessing. It's easier to just avoid ever
    reusing images.
    """
    sm_config = SMConfig.get_conf()

    def _upload_png_batch(
        png_cobj: CObj[List[Tuple[int, bytes]]], *, storage: Storage, perf: SubtaskProfiler
    ):
        def _upload_images(pngs):
            return [
                image_storage.post_image(image_storage.ISO, ds_id, png) if png is not None else None
                for png in pngs
            ]

        formula_png_chunk = load_cobj(storage, png_cobj)
        image_storage = ImageStorage(sm_config)
        n_images = 0
        tasks = (
            pd.DataFrame(formula_png_chunk, columns=['formula_i', 'pngs'])
            .set_index('formula_i')
            .join(formula_i_to_db_id, how='inner')
        )
        # Limit parallelism to 6 to avoid accidentally hitting S3's upload limit (3500 PUTs/s).
        # The default parallelism (8 threads, because Cloud Functions get 4 CPUs) is slightly
        # too high for datasets with a lot of images.
        with ThreadPoolExecutor(6) as pool:
            db_formula_image_ids: DbFormulaImagesDict = defaultdict(dict)
            for db_id, formula_id, image_ids in zip(
                tasks.moldb_id, tasks.index, pool.map(_upload_images, tasks.pngs)
            ):
                db_formula_image_ids[db_id][formula_id] = image_ids
                n_images += len([i for i in image_ids if i is not None])

        perf.add_extra_data(n_tasks=len(tasks), n_images=n_images)
        return db_formula_image_ids

    results = executor.map(_upload_png_batch, [(cobj,) for cobj in png_cobjs], runtime_memory=512)

    db_formula_image_ids: DbFormulaImagesDict = defaultdict(dict)
    for result in results:
        for db_id, db_result in result.items():
            db_formula_image_ids[db_id].update(db_result)
    return db_formula_image_ids
def sm_config():
    SMConfig.set_path(Path(proj_root()) / TEST_CONFIG_PATH)
    SMConfig.get_conf(update=True)  # Force reload in case previous tests modified it

    worker_id = os.environ.get('PYTEST_XDIST_WORKER', 'gw0')
    test_id = f'sm_test_{worker_id}'

    # Update the internal cached copy of the config, so independent calls to SMConfig.get_conf()
    # also get the updated config
    SMConfig._config_dict['db']['database'] = test_id
    SMConfig._config_dict['elasticsearch']['index'] = test_id
    SMConfig._config_dict['rabbitmq']['prefix'] = f'test_{worker_id}__'

    for path in SMConfig._config_dict['lithops']['sm_storage'].values():
        # Prefix keys with the test ID so they can be cleaned up later
        path[1] = f'{test_id}/{path[1]}'

    # __LITHOPS_SESSION_ID determines the prefix to use for anonymous cloudobjects
    os.environ['__LITHOPS_SESSION_ID'] = f'{test_id}/cloudobjects'

    return SMConfig.get_conf()
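# Usage sketch (illustrative): in the real test suite this function is presumably
# registered as a pytest fixture (e.g. via @pytest.fixture in conftest.py); the test
# below is a made-up example of the isolation it provides.
def test_uses_isolated_resources(sm_config):
    # Each pytest-xdist worker gets its own database/index/queue prefix,
    # so test runs can execute in parallel without clobbering each other.
    assert sm_config['db']['database'].startswith('sm_test_')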
def __init__(
    self,
    ds: Dataset,
    perf: Profiler,
    sm_config: Optional[Dict] = None,
):
    self._sm_config = sm_config or SMConfig.get_conf()
    self._sc = None
    self._db = DB()
    self._ds = ds
    self._perf = perf
    self._es = ESExporter(self._db, self._sm_config)
    self._ds_data_path = None
def save_additional_info_to_db(db_id, user_id, input_path):
    conf = SMConfig.get_conf()
    with ConnectionPool(conf['db']):
        db = DB()
        if db.select_one('SELECT * FROM molecular_db WHERE id = %s', (db_id,)):
            print(f'Updating existing molecular database {db_id}')
            db.alter(
                'UPDATE molecular_db SET user_id = %s, input_path = %s WHERE id = %s',
                (user_id, input_path, db_id),
            )
        else:
            print(f'Specified molecular database {db_id} does not exist.')
def __init__(
    self,
    executor: Executor,
    ds: Dataset,
    perf: Profiler,
    sm_config: Optional[Dict] = None,
    use_cache=False,
    store_images=True,
):
    """
    Args:
        use_cache: For development - cache the results after each pipeline step so that
            it's easier to quickly re-run specific steps.
    """
    sm_config = sm_config or SMConfig.get_conf()
    self.sm_storage = sm_config['lithops']['sm_storage']
    self.storage = Storage(sm_config['lithops'])
    self.s3_client = get_s3_client()
    self.ds = ds
    self.perf = perf
    self.store_images = store_images
    self.db = DB()
    self.es = ESExporter(self.db, sm_config)
    self.imzml_cobj, self.ibd_cobj = _upload_imzmls_from_prefix_if_needed(
        self.ds.input_path, self.storage, self.sm_storage, self.s3_client
    )
    self.moldb_defs = _upload_moldbs_from_db(
        self.ds.config['database_ids'], self.storage, self.sm_storage
    )

    if use_cache:
        cache_key: Optional[str] = jsonhash({'input_path': ds.input_path, 'ds': ds.config})
    else:
        cache_key = None

    self.pipe = Pipeline(
        self.imzml_cobj,
        self.ibd_cobj,
        self.moldb_defs,
        self.ds.config,
        cache_key=cache_key,
        executor=executor,
    )

    self.results_dfs = None
    self.png_cobjs = None
    self.db_formula_image_ids = None
def main():
    parser = argparse.ArgumentParser(description='Migrate MolDB data from service to database')
    parser.add_argument('--config', default='conf/config.json', help='SM config path')
    args = parser.parse_args()

    SMConfig.set_path(args.config)
    config = SMConfig.get_conf()
    init_loggers(config['logs'])

    moldb_db_config = {'host': 'localhost', 'database': 'mol_db', 'user': '******'}
    dump_moldb_tables(moldb_db_config)
    import_moldb_tables(config['db'])

    os.remove('/tmp/molecule.csv')
    os.remove('/tmp/molecular_db.csv')
def __init__(self, db, es, status_queue=None, logger=None, sm_config=None):
    self._sm_config = sm_config or SMConfig.get_conf()
    self._slack_conf = self._sm_config.get('slack', {})
    self._db: DB = db
    self._es: ESExporter = es
    self._status_queue = status_queue
    self.logger = logger or logging.getLogger()

    if 'aws' in self._sm_config:
        self.ses = boto3.client(
            'ses',
            'eu-west-1',
            aws_access_key_id=self._sm_config['aws']['aws_access_key_id'],
            aws_secret_access_key=self._sm_config['aws']['aws_secret_access_key'],
        )
def __init__(
    self,
    spark_context: pyspark.SparkContext,
    imzml_reader: FSImzMLReader,
    moldbs: List[MolecularDB],
    ds_config: DSConfig,
    ds_data_path: Path,
    perf: Profiler,
):
    self._spark_context = spark_context
    self._ds_config = ds_config
    self._imzml_reader = imzml_reader
    self._moldbs = moldbs
    self._sm_config = SMConfig.get_conf()
    self._ds_data_path = ds_data_path
    self._perf = perf
def __init__(
    self,
    db,
    es,
    logger=None,
    annot_queue=None,
    update_queue=None,
    lit_queue=None,
    status_queue=None,
):
    self._sm_config = SMConfig.get_conf()
    self._db = db
    self._es = es
    self._annot_queue = annot_queue
    self._update_queue = update_queue
    self._lit_queue = lit_queue
    self._status_queue = status_queue
    self.logger = logger or logging.getLogger()
def __init__(self, manager, annot_qdesc, upd_qdesc, poll_interval=1):
    self._sm_config = SMConfig.get_conf()
    self._stopped = False
    self._manager = manager
    self._annot_queue_consumer = QueueConsumer(
        config=self._sm_config['rabbitmq'],
        qdesc=annot_qdesc,
        callback=self._callback,
        on_success=self._on_success,
        on_failure=self._on_failure,
        logger=self.logger,
        poll_interval=poll_interval,
    )
    self._update_queue_pub = QueuePublisher(
        config=self._sm_config['rabbitmq'], qdesc=upd_qdesc, logger=self.logger
    )
    self._redis_client = redis.Redis(**self._sm_config.get('redis', {}))
    Path(self._sm_config['fs']['spark_data_path']).mkdir(parents=True, exist_ok=True)
def _get_isotope_generation_from_metadata(metadata):
    assert 'MS_Analysis' in metadata

    sm_config = SMConfig.get_conf()
    polarity = metadata['MS_Analysis']['Polarity']
    polarity_sign = {'Positive': '+', 'Negative': '-'}[polarity]
    instrument = _normalize_instrument(metadata['MS_Analysis']['Analyzer'])

    resolving_power = metadata['MS_Analysis']['Detector_Resolving_Power']
    rp_mz = float(resolving_power['mz'])
    rp_resolution = float(resolving_power['Resolving_Power'])

    # Normalize the resolving power to its equivalent value at m/z 200: FTICR resolution
    # falls off linearly with m/z, Orbitrap resolution with the square root of m/z.
    if instrument == 'FTICR':
        rp200 = rp_resolution * rp_mz / 200.0
    elif instrument == 'Orbitrap':
        rp200 = rp_resolution * (rp_mz / 200.0) ** 0.5
    else:
        rp200 = rp_resolution

    if rp200 < 85000:
        params = RESOL_POWER_PARAMS['70K']
    elif rp200 < 120000:
        params = RESOL_POWER_PARAMS['100K']
    elif rp200 < 195000:
        params = RESOL_POWER_PARAMS['140K']
    elif rp200 < 265000:
        params = RESOL_POWER_PARAMS['250K']
    elif rp200 < 390000:
        params = RESOL_POWER_PARAMS['280K']
    elif rp200 < 625000:
        params = RESOL_POWER_PARAMS['500K']
    elif rp200 < 875000:
        params = RESOL_POWER_PARAMS['750K']
    else:
        params = RESOL_POWER_PARAMS['1000K']

    default_adducts = sm_config['ds_config_defaults']['adducts'][polarity_sign]
    charge = {'+': 1, '-': -1}[polarity_sign]
    isocalc_sigma = float(f"{params['sigma']:f}")
    return default_adducts, charge, isocalc_sigma, instrument
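# Worked example (illustrative values): an Orbitrap with resolving power 70,000
# specified at m/z 400 normalizes to rp200 = 70000 * (400 / 200) ** 0.5 ≈ 98,995,
# which lands in the [85,000, 120,000) band and selects RESOL_POWER_PARAMS['100K'].
def _example_rp200_scaling():
    rp_resolution, rp_mz = 70000.0, 400.0
    rp200 = rp_resolution * (rp_mz / 200.0) ** 0.5  # ≈ 98995
    assert 85000 <= rp200 < 120000  # the '100K' parameter band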
def run(ds_id, sql_where):
    conf = SMConfig.get_conf()

    db = DB(conf['db'])
    img_store = ImageStoreServiceWrapper(conf['services']['img_service_url'])

    if sql_where:
        ds_ids = [
            id for (id,) in db.select(f'SELECT DISTINCT dataset.id FROM dataset WHERE {sql_where}')
        ]
    else:
        ds_ids = ds_id.split(',')

    if not ds_ids:
        logger.warning('No datasets match filter')
        return

    for i, ds_id in enumerate(ds_ids):
        try:
            logger.info(f'[{i+1} / {len(ds_ids)}] Updating acq geometry for {ds_id}')
            ds = Dataset.load(db, ds_id)
            (sample_img_id,) = db.select_one(
                'SELECT iim.iso_image_ids[1] FROM job j '
                'JOIN iso_image_metrics iim ON j.id = iim.job_id '
                'WHERE j.ds_id = %s LIMIT 1',
                [ds_id],
            )
            logger.debug(f'Sample image id: {sample_img_id}')
            if sample_img_id:
                w, h = img_store.get_image_by_id('fs', 'iso_image', sample_img_id).size
                dims = (h, w)  # (n_rows, n_cols)
            else:
                dims = (None, None)
            acq_geometry = make_acq_geometry('ims', None, ds.metadata, dims)
            ds.save_acq_geometry(db, acq_geometry)
        except Exception:
            logger.error(f'Failed on {ds_id}', exc_info=True)
def generate_ion_thumbnail_lithops(
    executor: Executor,
    db,
    ds: Dataset,
    only_if_needed=False,
    algorithm=DEFAULT_ALGORITHM,
):
    try:
        (existing_thumb_id,) = db.select_one(THUMB_SEL, [ds.id])

        if existing_thumb_id and only_if_needed:
            return

        annotation_rows = db.select(ISO_IMAGE_SEL, [ds.id])

        if not annotation_rows:
            logger.warning('Could not create ion thumbnail - no annotations found')
            return

        ds_id = ds.id
        sm_config = SMConfig.get_conf()

        def generate(annotation_rows):
            return _generate_ion_thumbnail_image(
                image_storage.ImageStorage(sm_config), ds_id, annotation_rows, algorithm
            )

        thumbnail = executor.call(
            generate, (annotation_rows,), runtime_memory=2048, include_modules=['png']
        )

        image_id = _save_ion_thumbnail_image(ds.id, thumbnail)
        image_url = image_storage.get_image_url(image_storage.THUMB, ds.id, image_id)
        db.alter(THUMB_UPD, [image_id, image_url, ds.id])

        if existing_thumb_id:
            image_storage.delete_image(image_storage.THUMB, ds.id, existing_thumb_id)
    except Exception:
        logger.error('Error generating ion thumbnail image', exc_info=True)
def _post_images_to_image_store(self, ion_images_rdd, alpha_channel, n_peaks):
    logger.info('Posting iso images to image store')
    png_generator = PngGenerator(alpha_channel, greyscale=True)
    ds_id = self.ds_id
    sm_config = SMConfig.get_conf()

    def generate_png_and_post(partition):
        image_storage = ImageStorage(sm_config)
        for formula_i, imgs in partition:
            iso_image_ids = [None] * n_peaks
            for k, img in enumerate(imgs):
                if img is not None:
                    img_bytes = png_generator.generate_png(img.toarray())
                    iso_image_ids[k] = image_storage.post_image(
                        image_storage.ISO, ds_id, img_bytes
                    )
            yield formula_i, iso_image_ids

    return dict(ion_images_rdd.mapPartitions(generate_png_and_post).collect())
def __init__(self, sc, isocalc):
    """
    Args:
        sc (SparkContext):
        isocalc (IsocalcWrapper):
    """
    self._sc = sc
    self._isocalc = isocalc
    self._sm_config = SMConfig.get_conf()
    self._parquet_chunks_n = 64
    self._iso_gen_part_n = 512

    if self._isocalc.analysis_version < 2:
        self._ion_centroids_path = '{}/{}/{}/{}'.format(
            self._sm_config['isotope_storage']['path'],
            self._isocalc.n_peaks,
            self._isocalc.sigma,
            self._isocalc.charge,
        )
    else:
        self._ion_centroids_path = '{}/v{}/{}/{}_{}/{}'.format(
            self._sm_config['isotope_storage']['path'],
            self._isocalc.analysis_version,
            self._isocalc.n_peaks,
            self._isocalc.instrument,
            self._isocalc.sigma,
            self._isocalc.charge,
        )
    self._parquet_file_names = ['centroids.parquet', 'formulas.parquet']
    self._centroids_stored_on_s3 = self._ion_centroids_path.startswith('s3a://')
    if self._centroids_stored_on_s3:
        self._local_ion_centroids_path = Path('/tmp')
    else:
        self._local_ion_centroids_path = Path(self._ion_centroids_path)
    Path(self._local_ion_centroids_path).mkdir(parents=True, exist_ok=True)

    self._s3 = get_s3_client()
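# Path layout sketch (illustrative values - bucket name, sigma etc. are made up):
# with isotope_storage.path = 's3a://sm-centroids', n_peaks=4, sigma=0.002, charge=1,
# instrument='Orbitrap', the format strings above produce:
#   analysis_version 1:  s3a://sm-centroids/4/0.002/1
#   analysis_version 2:  s3a://sm-centroids/v2/4/Orbitrap_0.002/1
# The v2 layout adds the version and instrument so centroids computed under different
# analysis settings never collide in the cache.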
def __init__(  # pylint: disable=too-many-arguments
    self,
    *,
    id: str,  # pylint: disable=redefined-builtin
    name: str,
    input_path: str,
    upload_dt: datetime,
    metadata: Dict,
    config: DSConfig,
    status: str = DatasetStatus.QUEUED,
    status_update_dt: Optional[datetime] = None,
    is_public: bool = True,
):
    self.id = id
    self.name = name
    self.input_path = input_path
    self.upload_dt = upload_dt
    self.status = status
    self.status_update_dt = status_update_dt or datetime.now()
    self.is_public = is_public
    self.metadata = metadata
    self.config = config

    self._sm_config = SMConfig.get_conf()
def __init__(self, db):
    self._db = db
    self._sm_config = SMConfig.get_conf()
def get_s3_resource(sm_config: Optional[Dict] = None):
    return boto3.resource('s3', **_boto_client_kwargs(sm_config or SMConfig.get_conf()))
def __init__(self, es_config=None):
    if not es_config:
        es_config = SMConfig.get_conf()['elasticsearch']
    self._es = init_es_conn(es_config)
    self._ind_client = IndicesClient(self._es)
def __init__(self, dbconfig=None):
    self._dbconfig = dbconfig or SMConfig.get_conf()['db']
# NOTE: the parser setup below is a reconstruction - the original snippet begins after
# `parser` and `subparsers` were created. Argument names match the `args.*` references
# further down, but the exact flags are an assumption.
parser = argparse.ArgumentParser(description='Manage the Elasticsearch annotation index')
parser.add_argument('config_path', help='SM config path')
parser.add_argument('--inactive', action='store_true', help='Operate on the inactive index')
subparsers = parser.add_subparsers(dest='action')

create_subparser = subparsers.add_parser('create')
create_subparser.add_argument('--drop', action='store_true', help='Delete existing index if exists')
swap_subparser = subparsers.add_parser('swap', help='Swap the active and inactive indexes')
drop_subparser = subparsers.add_parser(
    'drop', help='Drop the index. Can only be used on the inactive index'
)
status_subparser = subparsers.add_parser('status', help='Show current index mapping')

args = parser.parse_args()

SMConfig.set_path(args.config_path)
init_loggers(SMConfig.get_conf()['logs'])

es_config = SMConfig.get_conf()['elasticsearch']
es_man = ESIndexManager(es_config)
alias = es_config['index']
active_index = es_man.internal_index_name(alias)
inactive_index = es_man.another_index_name(active_index)
index = inactive_index if args.inactive else active_index

if args.action == 'create':
    if args.drop:
        es_man.delete_index(index)
    es_man.create_index(index)
    if not args.inactive:
        es_man.remap_alias(index, alias)
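# Usage sketch (illustrative command lines; the script name is an assumption):
#   python manage_es_index.py conf/config.json create --drop
#   python manage_es_index.py conf/config.json --inactive create
#   python manage_es_index.py conf/config.json swap
# Creating against the inactive index and then swapping allows reindexing with
# zero downtime: the alias is only remapped once the new index is ready.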