def executor(sm_config):
    from sm.engine.annotation_lithops.executor import Executor

    executor = Executor(sm_config['lithops'], debug_run_locally=True)
    yield executor

    executor.clean()
    for bucket, prefix in sm_config['lithops']['sm_storage'].values():
        keys = executor.storage.list_keys(bucket, prefix)
        if keys:
            executor.storage.delete_objects(bucket, keys)
def run(sm_config, ds_id_str, sql_where, algorithm, use_lithops):
    db = DB()

    if sql_where:
        ds_ids = [
            id for (id,) in db.select(f'SELECT DISTINCT dataset.id FROM dataset WHERE {sql_where}')
        ]
    else:
        ds_ids = ds_id_str.split(',')

    if not ds_ids:
        logger.warning('No datasets match filter')
        return

    if use_lithops:
        executor = Executor(sm_config['lithops'])

    for i, ds_id in enumerate(ds_ids):
        try:
            logger.info(f'[{i+1} / {len(ds_ids)}] Generating ion thumbnail for {ds_id}')
            ds = Dataset.load(db, ds_id)
            if use_lithops:
                # noinspection PyUnboundLocalVariable
                generate_ion_thumbnail_lithops(executor, db, ds, algorithm=algorithm)
            else:
                generate_ion_thumbnail(db, ds, algorithm=algorithm)
        except Exception:
            logger.error(f'Failed on {ds_id}', exc_info=True)
def define_centr_segments(
    fexec: Executor,
    clip_centr_chunks_cobjs: List[CloudObject],
    centr_n: int,
    ds_size_mb: int,
):
    logger.info('Defining centroids segments bounds')

    def get_first_peak_mz(idx, cobject, *, storage):
        print(f'Extracting first peak mz values from clipped centroids dataframe {idx}')
        centr_df = load_cobj(storage, cobject)
        first_peak_df = centr_df[centr_df.peak_i == 0]
        return first_peak_df.mz.values

    first_peak_df_mz = np.concatenate(
        fexec.map(get_first_peak_mz, list(enumerate(clip_centr_chunks_cobjs)), runtime_memory=512)
    )

    data_per_centr_segm_mb = 50
    peaks_per_centr_segm = 10000
    centr_segm_n = int(
        max(ds_size_mb // data_per_centr_segm_mb, centr_n // peaks_per_centr_segm, MIN_CENTR_SEGMS)
    )

    segm_bounds_q = [i * 1 / centr_segm_n for i in range(0, centr_segm_n)]
    centr_segm_lower_bounds = np.quantile(first_peak_df_mz, segm_bounds_q)

    logger.info(
        f'Generated {len(centr_segm_lower_bounds)} centroids bounds: '
        f'{centr_segm_lower_bounds[0]}...{centr_segm_lower_bounds[-1]}'
    )
    return centr_segm_lower_bounds
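# --- Illustrative sketch (not part of the source module) ---
# Shows how the quantile-based lower bounds computed by define_centr_segments partition
# first-peak m/z values into roughly equal-sized segments. All numbers below are made up.
import numpy as np

first_peak_mz = np.sort(np.random.uniform(100, 1000, size=100_000))
centr_segm_n = 8  # in the real code: max(ds_size_mb // 50, centr_n // 10000, MIN_CENTR_SEGMS)
segm_bounds_q = [i / centr_segm_n for i in range(centr_segm_n)]
lower_bounds = np.quantile(first_peak_mz, segm_bounds_q)

# Each centroid would later fall into the segment whose lower bound precedes its m/z
segm_i = np.searchsorted(lower_bounds, first_peak_mz, side='right') - 1
print(np.bincount(segm_i))  # roughly equal counts per segment (~12500 each here)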
def load_ds(
    executor: Executor,
    imzml_cobject: CloudObject,
    ibd_cobject: CloudObject,
    ds_segm_size_mb: int,
) -> Tuple[LithopsImzMLReader, np.ndarray, List[CObj[pd.DataFrame]], np.ndarray]:
    try:
        ibd_head = executor.storage.head_object(ibd_cobject.bucket, ibd_cobject.key)
        ibd_size_mb = int(ibd_head['content-length']) / 1024 // 1024
    except Exception:
        logger.warning("Couldn't read ibd size", exc_info=True)
        ibd_size_mb = 1024

    # Guess the amount of memory needed. For the majority of datasets (no zero-intensity peaks,
    # separate m/z arrays per spectrum) approximately 3x the ibd file size is used during the
    # most memory-intense part (sorting the m/z array).
    if ibd_size_mb * 3 + 512 < 32 * 1024:
        logger.info(f'Found {ibd_size_mb}MB .ibd file. Trying serverless load_ds')
        runtime_memory = max(2048, int(2 ** np.ceil(np.log2(ibd_size_mb * 3 + 512))))
    else:
        logger.info(f'Found {ibd_size_mb}MB .ibd file. Using VM-based load_ds')
        runtime_memory = 128 * 1024

    imzml_reader, ds_segments_bounds, ds_segms_cobjs, ds_segm_lens = executor.call(
        _load_ds,
        (imzml_cobject, ibd_cobject, ds_segm_size_mb),
        runtime_memory=runtime_memory,
    )

    logger.info(f'Segmented dataset chunks into {len(ds_segms_cobjs)} segments')

    return imzml_reader, ds_segments_bounds, ds_segms_cobjs, ds_segm_lens
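# --- Illustrative sketch (not part of the source module) ---
# Works through the serverless-vs-VM memory heuristic above for a hypothetical 1500 MB .ibd file.
import numpy as np

ibd_size_mb = 1500
estimated_mb = ibd_size_mb * 3 + 512  # ~3x the ibd size plus overhead -> 5012 MB
if estimated_mb < 32 * 1024:  # small enough for a serverless runtime
    runtime_memory = max(2048, int(2 ** np.ceil(np.log2(estimated_mb))))
else:
    runtime_memory = 128 * 1024  # fall back to a 128 GB VM-based run
print(runtime_memory)  # 8192 - rounded up to the next power of two, with a 2048 MB floor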
def reprocess_dataset_local(
    sm_src, src_ds_id, dst_ds_id, update_metadata_func, skip_existing=True, use_cache=False
):
    existing = get_dataset_diagnostics(dst_ds_id)
    if skip_existing and existing:
        print(f'Skipping {dst_ds_id}\n', end=None)
        return dst_ds_id, None

    smds = sm_src.dataset(id=src_ds_id)
    db = DB()
    ds_metadata, ds_config = update_metadata_func(smds.metadata, smds.config)
    ds = Dataset(
        id=dst_ds_id,
        name=smds.name,
        input_path=smds.s3dir,
        upload_dt=datetime.now(),
        metadata=ds_metadata,
        config=ds_config,
        status=DatasetStatus.QUEUED,
        status_update_dt=None,
        is_public=False,
    )
    ds.save(db, None, True)

    with perf_profile(db, 'annotate_lithops', dst_ds_id) as perf:
        executor = Executor(SMConfig.get_conf()['lithops'], perf=perf)
        job = ServerAnnotationJob(executor, ds, perf, use_cache=use_cache)
        job.pipe.use_db_cache = False
        job.run()

    return dst_ds_id
def clip_centr_df(
    fexec: Executor, peaks_cobjs: List[CloudObject], mz_min: float, mz_max: float
) -> Tuple[List[CObj[pd.DataFrame]], int]:
    def clip_centr_df_chunk(peaks_i, peaks_cobject, storage):
        print(f'Clipping centroids dataframe chunk {peaks_i}')
        centroids_df_chunk = load_cobj(storage, peaks_cobject).sort_values('mz')
        centroids_df_chunk = centroids_df_chunk[centroids_df_chunk.mz > 0]

        ds_mz_range_unique_formulas = centroids_df_chunk[
            (mz_min < centroids_df_chunk.mz) & (centroids_df_chunk.mz < mz_max)
        ].index.unique()
        centr_df_chunk = centroids_df_chunk[
            centroids_df_chunk.index.isin(ds_mz_range_unique_formulas)
        ].reset_index()

        clip_centr_chunk_cobject = save_cobj(storage, centr_df_chunk)
        return clip_centr_chunk_cobject, centr_df_chunk.shape[0]

    assert len(peaks_cobjs) > 0
    clip_centr_chunks_cobjs, centr_n = fexec.map_unpack(
        clip_centr_df_chunk,
        list(enumerate(peaks_cobjs)),
        runtime_memory=512,
    )
    clip_centr_chunks_cobjs = list(clip_centr_chunks_cobjs)
    centr_n = sum(centr_n)
    logger.info(f'Prepared {centr_n} centroids')
    return clip_centr_chunks_cobjs, centr_n
def __init__(
    self,
    imzml_cobject: CloudObject,
    ibd_cobject: CloudObject,
    moldbs: List[InputMolDb],
    ds_config: DSConfig,
    executor: Executor = None,
    lithops_config=None,
    cache_key=None,
    use_db_cache=True,
    use_db_mutex=True,
):
    lithops_config = lithops_config or SMConfig.get_conf()['lithops']
    self.lithops_config = lithops_config
    self._db = DB()

    self.imzml_cobject = imzml_cobject
    self.ibd_cobject = ibd_cobject
    self.moldbs = moldbs
    self.ds_config = ds_config
    self.isocalc_wrapper = IsocalcWrapper(ds_config)

    self.executor = executor or Executor(lithops_config)
    self.storage = self.executor.storage

    if cache_key is not None:
        self.cacher: Optional[PipelineCacher] = PipelineCacher(
            self.storage, cache_key, lithops_config
        )
    else:
        self.cacher = None

    self.use_db_cache = use_db_cache
    self.use_db_mutex = use_db_mutex
    self.ds_segm_size_mb = 128
def store_images_to_s3(
    executor: Executor,
    ds_id: str,
    formula_i_to_db_id: pd.Series,
    png_cobjs: List[CObj[List[Tuple[int, bytes]]]],
) -> DbFormulaImagesDict:
    """
    Upload PNG isotopic images to S3 image storage.

    Images may be uploaded multiple times if a formula_i is in multiple databases
    (i.e. there are duplicates in the formula_i_to_db_id index). This is intentional,
    as there's no check for reused images when deleting individual dataset jobs,
    e.g. by removing a moldb without reprocessing. It's easier to just avoid ever
    reusing images.
    """
    sm_config = SMConfig.get_conf()

    def _upload_png_batch(
        png_cobj: CObj[List[Tuple[int, bytes]]], *, storage: Storage, perf: SubtaskProfiler
    ):
        def _upload_images(pngs):
            return [
                image_storage.post_image(image_storage.ISO, ds_id, png) if png is not None else None
                for png in pngs
            ]

        formula_png_chunk = load_cobj(storage, png_cobj)
        image_storage = ImageStorage(sm_config)
        n_images = 0

        tasks = (
            pd.DataFrame(formula_png_chunk, columns=['formula_i', 'pngs'])
            .set_index('formula_i')
            .join(formula_i_to_db_id, how='inner')
        )

        # Limit parallelism to 6 to avoid accidentally hitting S3's upload limit (3500 PUTs/s)
        # The default parallelism (8 threads, because Cloud Functions get 4 CPUs) is slightly
        # too high for datasets with a lot of images.
        with ThreadPoolExecutor(6) as executor:
            db_formula_image_ids: DbFormulaImagesDict = defaultdict(dict)

            for db_id, formula_id, image_ids in zip(
                tasks.moldb_id, tasks.index, executor.map(_upload_images, tasks.pngs)
            ):
                db_formula_image_ids[db_id][formula_id] = image_ids
                n_images += len([i for i in image_ids if i is not None])

        perf.add_extra_data(n_tasks=len(tasks), n_images=n_images)

        return db_formula_image_ids

    results = executor.map(
        _upload_png_batch, [(cobj,) for cobj in png_cobjs], runtime_memory=512
    )

    db_formula_image_ids: DbFormulaImagesDict = defaultdict(dict)
    for result in results:
        for db_id, db_result in result.items():
            db_formula_image_ids[db_id].update(db_result)

    return db_formula_image_ids
def filter_results_and_make_pngs(
    fexec: Executor,
    formula_metrics_df: pd.DataFrame,
    moldbs: List[InputMolDb],
    fdrs: Dict[int, pd.DataFrame],
    images_df: pd.DataFrame,
    imzml_reader: LithopsImzMLReader,
):
    results_dfs = {}
    all_formula_is = set()
    for moldb_id, fdr in fdrs.items():
        result_df = (
            # Drop any columns already in fdr, as the FDR results may add or overwrite columns
            # with values from the scoring function.
            formula_metrics_df.drop(columns=fdr.columns, errors='ignore')
            .join(fdr, how='inner')
            .sort_values('fdr')
        )
        # Filter out zero-MSM annotations again to ensure that untargeted databases don't get
        # zero-MSM annotations, even if they have some overlap with targeted databases.
        is_targeted = any(db['targeted'] for db in moldbs if db['id'] == moldb_id)
        if not is_targeted:
            result_df = result_df[(result_df.msm > 0) & (result_df.fdr < 1)]
        results_dfs[moldb_id] = result_df
        all_formula_is.update(results_dfs[moldb_id].index)

    image_tasks_df = images_df[images_df.index.isin(all_formula_is)].copy()
    jobs = _split_png_jobs(image_tasks_df, imzml_reader.w, imzml_reader.h)

    png_generator = PngGenerator(imzml_reader.mask)

    def save_png_chunk(df: pd.DataFrame, *, storage: Storage):
        pngs = []
        groups = defaultdict(list)
        for formula_i, cobj in df.cobj.items():
            groups[cobj].append(formula_i)

        image_dict_iter = iter_cobjs_with_prefetch(storage, list(groups.keys()))
        for image_dict, formula_is in zip(image_dict_iter, groups.values()):
            for formula_i in formula_is:
                formula_pngs = [
                    png_generator.generate_png(img.toarray()) if img is not None else None
                    for img in image_dict[formula_i]
                ]
                pngs.append((formula_i, formula_pngs))
        return save_cobj(storage, pngs)

    png_cobjs = fexec.map(save_png_chunk, jobs, include_modules=['png'], runtime_memory=1024)

    return results_dfs, png_cobjs
def run_fdr(
    executor: Executor,
    formula_scores_df: pd.DataFrame,
    db_data_cobjs: List[CObj[DbFDRData]],
    ds_config: DSConfig,
) -> Dict[int, pd.DataFrame]:
    def _run_fdr_for_db(db_data_cobject: CObj[DbFDRData], *, storage: Storage):
        print(f'Loading FDR data from {db_data_cobject}')
        db_data = load_cobj(storage, db_data_cobject)
        moldb_id = db_data['id']
        fdr = db_data['fdr']
        formula_map_df = db_data['formula_map_df']

        formula_msm = formula_map_df.merge(
            formula_scores_df, how='inner', left_on='formula_i', right_index=True
        )
        modifiers = fdr.target_modifiers_df[['chem_mod', 'neutral_loss', 'adduct']]
        results_df = (
            fdr.estimate_fdr(formula_msm, scoring_model)
            .assign(moldb_id=moldb_id)
            .set_index('formula_i')
            .merge(modifiers, left_on='modifier', right_index=True, how='outer')
        )

        return moldb_id, results_df

    logger.info('Estimating FDRs...')
    scoring_model = load_scoring_model(ds_config['fdr'].get('scoring_model'))

    args = [(db_data_cobj,) for db_data_cobj in db_data_cobjs]
    results = executor.map(_run_fdr_for_db, args, runtime_memory=2048)

    for moldb_id, moldb_fdrs in results:
        logger.info(f'DB {moldb_id} number of annotations with FDR less than:')
        for fdr_step in [0.05, 0.1, 0.2, 0.5]:
            logger.info(f'{fdr_step * 100:2.0f}%: {(moldb_fdrs.fdr <= fdr_step).sum()}')

    return dict(results)
def annotate_lithops(self, ds: Dataset, del_first=False):
    if del_first:
        self.logger.warning(f'Deleting all results for dataset: {ds.id}')
        del_jobs(ds)
    ds.save(self._db, self._es)

    with perf_profile(self._db, 'annotate_lithops', ds.id) as perf:
        executor = Executor(self._sm_config['lithops'], perf=perf)
        ServerAnnotationJob(executor, ds, perf).run()

        if self._sm_config['services'].get('colocalization', True):
            Colocalization(self._db).run_coloc_job_lithops(executor, ds, reprocess=del_first)

        if self._sm_config['services'].get('ion_thumbnail', True):
            generate_ion_thumbnail_lithops(
                executor=executor,
                db=self._db,
                ds=ds,
                only_if_needed=not del_first,
            )
def build_moldb(
    executor: Executor,
    ds_config: DSConfig,
    moldbs: List[InputMolDb],
) -> Tuple[List[CObj[pd.DataFrame]], List[CObj[DbFDRData]]]:
    def _build_moldb(
        *, storage: Storage, perf: Profiler
    ) -> Tuple[List[CObj[pd.DataFrame]], List[CObj[DbFDRData]]]:
        logger.info('Generating formulas...')
        db_data_cobjs, formulas_df = get_formulas_df(storage, ds_config, moldbs)
        num_formulas = len(formulas_df)
        perf.record_entry('generated formulas', num_formulas=num_formulas)

        logger.info('Storing formulas...')
        formula_cobjs = store_formula_segments(storage, formulas_df)
        perf.record_entry('stored formulas', num_chunks=len(formula_cobjs))

        return formula_cobjs, db_data_cobjs

    return executor.call(_build_moldb, (), runtime_memory=4096)
def generate_ion_thumbnail_lithops(
    executor: Executor,
    db,
    ds: Dataset,
    only_if_needed=False,
    algorithm=DEFAULT_ALGORITHM,
):
    try:
        (existing_thumb_id,) = db.select_one(THUMB_SEL, [ds.id])

        if existing_thumb_id and only_if_needed:
            return

        annotation_rows = db.select(ISO_IMAGE_SEL, [ds.id])

        if not annotation_rows:
            logger.warning('Could not create ion thumbnail - no annotations found')
            return

        ds_id = ds.id
        sm_config = SMConfig.get_conf()

        def generate(annotation_rows):
            return _generate_ion_thumbnail_image(
                image_storage.ImageStorage(sm_config), ds_id, annotation_rows, algorithm
            )

        thumbnail = executor.call(
            generate, (annotation_rows,), runtime_memory=2048, include_modules=['png']
        )

        image_id = _save_ion_thumbnail_image(ds.id, thumbnail)
        image_url = image_storage.get_image_url(image_storage.THUMB, ds.id, image_id)
        db.alter(THUMB_UPD, [image_id, image_url, ds.id])

        if existing_thumb_id:
            image_storage.delete_image(image_storage.THUMB, ds.id, existing_thumb_id)

    except Exception:
        logger.error('Error generating ion thumbnail image', exc_info=True)
def run_coloc_job_lithops(self, fexec: Executor, ds: Dataset, reprocess: bool = False):
    # Extract required fields to avoid pickling Dataset, because unpickling Dataset tries to
    # import psycopg2 and fails inside Functions
    ds_id = ds.id
    sm_config = self._sm_config

    def run_coloc_job(moldb_id, image_ids, ion_ids, fdrs, *, storage):
        # Use web_app_url to get the publicly-exposed storage server address, because
        # Functions can't use the private address
        images, h, w = _get_images(ImageStorage(sm_config), ds_id, image_ids)

        cobjs = []
        for job in analyze_colocalization(ds_id, moldb_id, images, ion_ids, fdrs, h, w):
            cobjs.append(save_cobj(storage, job))
        return cobjs

    tasks = list(self._iter_pending_coloc_tasks(ds.id, reprocess))
    cost_factors = pd.DataFrame({'n_images': [len(task[1]) for task in tasks]})
    job_cobjs = fexec.map_concat(
        run_coloc_job, tasks, cost_factors=cost_factors, runtime_memory=4096
    )

    for job in iter_cobjs_with_prefetch(fexec.storage, job_cobjs):
        self._save_job_to_db(job)
def run_search(self, store_images=False, use_lithops=False):
    if not store_images:
        self._patch_image_storage()

    moldb_id = molecular_db.find_by_name_version(self.moldb['name'], self.moldb['version']).id
    os.environ['PYSPARK_PYTHON'] = sys.executable
    ds = create_ds_from_files(self.ds_id, self.ds_name, self.input_path)
    ds.config['analysis_version'] = self.analysis_version
    ds.config['fdr']['scoring_model'] = 'v3_default' if self.analysis_version > 1 else None
    ds.config['database_ids'] = [moldb_id]

    self.db.alter('DELETE FROM job WHERE ds_id=%s', params=(ds.id,))
    ds.save(self.db, allow_insert=True)
    perf = NullProfiler()

    if use_lithops:
        # Override the runtime to force it to run without docker.
        lithops_executor.RUNTIME_CF_VPC = 'python'
        lithops_executor.RUNTIME_CE = 'python'
        executor = Executor(self.sm_config['lithops'], perf)
        job = ServerAnnotationJob(
            executor,
            ds,
            perf,
            self.sm_config,
            store_images=store_images,
        )
        job.run(debug_validate=True)
    else:
        AnnotationJob(ds, perf).run()

    self.make_comparison_df()
def calculate_centroids(
    fexec: Executor, formula_cobjs: List[CObj[pd.DataFrame]], isocalc_wrapper: IsocalcWrapper
) -> List[CObj[pd.DataFrame]]:
    def calculate_peaks_for_formula(args):
        formula_i, formula, target, targeted = args
        mzs, ints = isocalc_wrapper.centroids(formula)
        if mzs is not None:
            return [
                (formula_i, peak_i, mzs[peak_i], ints[peak_i], target, targeted)
                for peak_i in range(len(mzs))
            ]
        return []

    def calculate_peaks_chunk(segm_i: int, segm_cobject: CObj[pd.DataFrame], *, storage: Storage):
        print(f'Calculating peaks from formulas chunk {segm_i}')
        chunk_df = load_cobj(storage, segm_cobject)
        chunk_iter = chunk_df[['ion_formula', 'target', 'targeted']].itertuples(True, None)
        peaks = list(chain(*map(calculate_peaks_for_formula, chunk_iter)))
        peaks_df = pd.DataFrame(
            peaks, columns=['formula_i', 'peak_i', 'mz', 'int', 'target', 'targeted']
        )
        peaks_df = peaks_df.astype(
            {
                'formula_i': 'u4',
                'peak_i': 'u1',
                'mz': 'f8',
                'int': 'f4',
                'target': '?',
                'targeted': '?',
            }
        )
        peaks_df.set_index('formula_i', inplace=True)

        print(f'Storing centroids chunk {segm_i}')
        peaks_cobject = save_cobj(storage, peaks_df)

        return peaks_cobject, peaks_df.shape[0]

    peaks_cobjs, peaks_cobject_lens = fexec.map_unpack(
        calculate_peaks_chunk,
        list(enumerate(formula_cobjs)),
        runtime_memory=2048,
    )
    num_centroids = sum(peaks_cobject_lens)
    logger.info(f'Calculated {num_centroids} centroids in {len(peaks_cobjs)} chunks')

    def _sort_peaks_cobjs(*, storage):
        df = pd.concat(load_cobjs(storage, peaks_cobjs))
        first_peak_mz = df.mz[df.peak_i == 0].sort_values()

        peaks_chunk_size = 64 * 2 ** 20
        n_chunks = int(np.ceil(df.memory_usage().sum() / peaks_chunk_size))
        cnt = len(first_peak_mz)
        chunks = (
            df.loc[first_peak_mz.index[cnt * i // n_chunks : cnt * (i + 1) // n_chunks]]
            for i in range(n_chunks)
        )

        return save_cobjs(storage, chunks)

    sorted_peaks_cobjs = fexec.call(
        _sort_peaks_cobjs,
        (),
        cost_factors={'num_centroids': num_centroids, 'num_peak_cobjects': len(peaks_cobjs)},
        runtime_memory=256 + 100 * num_centroids / 2 ** 20,
    )

    logger.info(f'Sorted centroids chunks into {len(sorted_peaks_cobjs)} chunks')
    return sorted_peaks_cobjs
def segment_centroids(
    fexec: Executor,
    peaks_cobjs: List[CObj[pd.DataFrame]],
    ds_segms_cobjs: List[CObj[pd.DataFrame]],
    ds_segms_bounds: np.ndarray,
    ds_segm_size_mb: int,
    is_intensive_dataset: bool,
    isocalc_wrapper: IsocalcWrapper,
) -> List[CObj[pd.DataFrame]]:
    # pylint: disable=too-many-locals
    mz_min, mz_max = ds_segms_bounds[0, 0], ds_segms_bounds[-1, 1]

    clip_centr_chunks_cobjs, centr_n = clip_centr_df(fexec, peaks_cobjs, mz_min, mz_max)

    # define first level segmentation and then segment each one into desired number
    centr_segm_lower_bounds = define_centr_segments(
        fexec,
        clip_centr_chunks_cobjs,
        centr_n,
        len(ds_segms_cobjs) * ds_segm_size_mb,
    )

    first_level_centr_segm_n = min(32, len(centr_segm_lower_bounds))
    centr_segm_lower_bounds = np.array_split(centr_segm_lower_bounds, first_level_centr_segm_n)
    first_level_centr_segm_bounds = np.array([bounds[0] for bounds in centr_segm_lower_bounds])

    def segment_centr_df(centr_df, db_segm_lower_bounds):
        first_peak_df = centr_df[centr_df.peak_i == 0].copy()
        segment_mapping = (
            np.searchsorted(db_segm_lower_bounds, first_peak_df.mz.values, side='right') - 1
        )
        first_peak_df['segm_i'] = segment_mapping
        centr_segm_df = pd.merge(
            centr_df, first_peak_df[['formula_i', 'segm_i']], on='formula_i'
        ).sort_values('mz')
        return centr_segm_df

    def segment_centr_chunk(idx, cobject, *, storage):
        print(f'Segmenting clipped centroids dataframe chunk {idx}')
        centr_df = load_cobj(storage, cobject)
        centr_segm_df = segment_centr_df(centr_df, first_level_centr_segm_bounds)

        def _first_level_upload(args):
            segm_i, df = args
            del df['segm_i']
            return segm_i, save_cobj(storage, df)

        with ThreadPoolExecutor(max_workers=128) as pool:
            sub_segms = list(centr_segm_df.groupby('segm_i'))
            sub_segms_cobjs = list(pool.map(_first_level_upload, sub_segms))

        return dict(sub_segms_cobjs)

    first_level_segms_cobjs = fexec.map(
        segment_centr_chunk, list(enumerate(clip_centr_chunks_cobjs)), runtime_memory=1024
    )

    def merge_centr_df_segments(segm_i, segm_cobjects, *, storage):
        print(f'Merging segment {segm_i} clipped centroids chunks')
        # Temporarily index by formula_i for faster filtering when saving
        segm = pd.concat(load_cobjs(storage, segm_cobjects)).set_index('formula_i')

        formula_segms_df = choose_ds_segments_per_formula(ds_segms_bounds, segm, isocalc_wrapper)
        # Try to balance formulas so that they all span roughly the same number of DS segments,
        # and have roughly the same number of formulas.
        max_segm_span = max((formula_segms_df.hi - formula_segms_df.lo).max(), 3)
        if is_intensive_dataset:
            max_segm_count = int(round(np.clip(centr_n / 1000, 1000, 5000)))
        else:
            max_segm_count = int(round(np.clip(centr_n / 1000, 1000, 15000)))

        formula_i_groups = []
        segm_lo_idx = 0
        while segm_lo_idx < len(formula_segms_df):
            max_segm_hi = formula_segms_df.lo[segm_lo_idx] + max_segm_span + 1
            max_span_idx = np.searchsorted(formula_segms_df.hi, max_segm_hi, 'left')
            segm_hi_idx = min(segm_lo_idx + max_segm_count, max_span_idx, len(formula_segms_df))
            formula_i_groups.append(formula_segms_df.formula_i.values[segm_lo_idx:segm_hi_idx])
            print(segm_lo_idx, segm_hi_idx)
            segm_lo_idx = segm_hi_idx

        def _second_level_upload(formula_is):
            return save_cobj(storage, segm.loc[formula_is].sort_values('mz').reset_index())

        print(f'Storing {len(formula_i_groups)} centroids segments')
        with ThreadPoolExecutor(max_workers=4) as pool:
            segms_cobjects = list(pool.map(_second_level_upload, formula_i_groups))

        return segms_cobjects

    second_level_segms_dict = defaultdict(list)
    for sub_segms_cobjs in first_level_segms_cobjs:
        for first_level_segm_i in sub_segms_cobjs:
            second_level_segms_dict[first_level_segm_i].append(
                sub_segms_cobjs[first_level_segm_i]
            )
    second_level_segms_cobjs = sorted(second_level_segms_dict.items(), key=lambda x: x[0])

    first_level_cobjs = [co for cos in first_level_segms_cobjs for co in cos.values()]

    db_segms_cobjs = fexec.map_concat(
        merge_centr_df_segments, second_level_segms_cobjs, runtime_memory=512
    )

    fexec.storage.delete_cloudobjects(first_level_cobjs)

    return db_segms_cobjs
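# --- Illustrative sketch (not part of the source module) ---
# Shows how the balancing loop in merge_centr_df_segments groups formulas after each one has
# been assigned a [lo, hi] range of dataset segments. The toy data and cut-off values are made
# up; it assumes formula_segms_df is sorted by (lo, hi), as np.searchsorted requires.
import numpy as np
import pandas as pd

formula_segms_df = pd.DataFrame(
    {'formula_i': range(8), 'lo': [0, 0, 1, 1, 2, 5, 6, 6], 'hi': [1, 2, 2, 3, 4, 6, 7, 8]}
)
max_segm_span, max_segm_count = 3, 4

formula_i_groups = []
segm_lo_idx = 0
while segm_lo_idx < len(formula_segms_df):
    # Close the group once it would span too many DS segments or contain too many formulas
    max_segm_hi = formula_segms_df.lo[segm_lo_idx] + max_segm_span + 1
    max_span_idx = np.searchsorted(formula_segms_df.hi, max_segm_hi, 'left')
    segm_hi_idx = min(segm_lo_idx + max_segm_count, max_span_idx, len(formula_segms_df))
    formula_i_groups.append(formula_segms_df.formula_i.values[segm_lo_idx:segm_hi_idx])
    segm_lo_idx = segm_hi_idx

print([list(g) for g in formula_i_groups])  # [[0, 1, 2, 3], [4], [5, 6, 7]]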
def validate_centroids(fexec: Executor, peaks_cobjs: List[CObj[pd.DataFrame]]):
    # Ignore code duplicated with validate_centroid_segments, as the duplicated parts of the code
    # are too entangled with the non-duplicated parts.
    def warn(message, df=None):
        warnings.append(message)
        logger.warning(message)
        if df is not None:
            logger.warning(df)

    def get_segm_stats(segm_cobject: CObj[pd.DataFrame], *, storage: Storage):
        segm = load_cobj(storage, segm_cobject)
        n_peaks = segm.groupby(level='formula_i').peak_i.count()
        formula_is = segm.index.unique()
        stats = pd.Series(
            {
                'min_mz': segm.mz.min(),
                'max_mz': segm.mz.max(),
                'min_formula_i': segm.index.min(),
                'max_formula_i': segm.index.max(),
                'avg_n_peaks': n_peaks.mean(),
                'min_n_peaks': n_peaks.min(),
                'max_n_peaks': n_peaks.max(),
                'max_int': segm.int.max(),
                'missing_peaks': (
                    segm.loc[n_peaks.index[n_peaks != 4]]
                    .groupby(level='formula_i')
                    .peak_i.apply(lambda peak_is: len(set(range(len(peak_is))) - set(peak_is)))
                    .sum()
                ),
                'n_formulas': len(formula_is),
                'n_peaks': len(segm),
            }
        )
        return formula_is, stats

    warnings: List[str] = []

    results = fexec.map(get_segm_stats, [(co,) for co in peaks_cobjs], runtime_memory=1024)
    segm_formula_is = [formula_is for formula_is, stats in results]
    stats_df = pd.DataFrame([stats for formula_is, stats in results])

    with pd.option_context(
        'display.max_rows', None, 'display.max_columns', None, 'display.width', 1000
    ):
        # Report cases with fewer peaks than expected (an indication that formulas are being
        # split between multiple segments)
        wrong_n_peaks = stats_df[
            (stats_df.avg_n_peaks < 3.9) | (stats_df.min_n_peaks < 2) | (stats_df.max_n_peaks > 4)
        ]
        if not wrong_n_peaks.empty:
            warn(
                'segment_centroids produced segments with unexpected peaks-per-formula '
                '(should be almost always 4, occasionally 2 or 3):',
                wrong_n_peaks,
            )

        # Report missing peaks
        missing_peaks = stats_df[stats_df.missing_peaks > 0]
        if not missing_peaks.empty:
            warn('segment_centroids produced segments with missing peaks:', missing_peaks)

    formula_in_segms_df = validate_formulas_not_in_multiple_segms(segm_formula_is, warn)

    logger.debug(
        f'Found {stats_df.n_peaks.sum()} peaks for {stats_df.n_formulas.sum()} formulas '
        f'across {len(peaks_cobjs)} segms'
    )
    n_per_segm = formula_in_segms_df.groupby('segm_i').formula_i.count()
    logger.debug(f'Segm sizes range from {n_per_segm.min()} to {n_per_segm.max()}')

    if warnings:
        try:
            __import__('__main__').stats_df = stats_df
            print('validate_centroids debug info written to "stats_df" variable')
        except Exception:
            pass
        raise AssertionError('Some checks failed in validate_centroids')
def run_coloc_jobs(
    sm_config, ds_id_str, sql_where, fix_missing, fix_corrupt, skip_existing, use_lithops
):
    assert (
        len(
            [
                data_source
                for data_source in [ds_id_str, sql_where, fix_missing, fix_corrupt]
                if data_source
            ]
        )
        == 1
    ), "Exactly one data source (ds_id, sql_where, fix_missing, fix_corrupt) must be specified"
    assert not (ds_id_str and sql_where)

    db = DB()

    if ds_id_str:
        ds_ids = ds_id_str.split(',')
    elif sql_where:
        ds_ids = [
            id for (id,) in db.select(f'SELECT DISTINCT dataset.id FROM dataset WHERE {sql_where}')
        ]
    else:
        mol_dbs = [
            (doc['id'], doc['name'])
            for doc in db.select_with_fields('SELECT id, name FROM molecular_db m')
        ]
        mol_db_ids, mol_db_names = map(list, zip(*mol_dbs))
        fdrs = [0.05, 0.1, 0.2, 0.5]
        algorithms = ['median_thresholded_cosine', 'cosine']

        if fix_missing:
            logger.info('Checking for missing colocalization jobs...')
            results = db.select(
                MISSING_COLOC_JOBS_SEL, [list(mol_db_ids), list(mol_db_names), fdrs, algorithms]
            )
            ds_ids = [ds_id for ds_id, in results]
            logger.info(f'Found {len(ds_ids)} missing colocalization sets')
        else:
            logger.info(
                'Checking all colocalization jobs. '
                'This is super slow: ~5 minutes per 1000 datasets...'
            )
            results = db.select(
                CORRUPT_COLOC_JOBS_SEL, [list(mol_db_ids), list(mol_db_names), fdrs, algorithms]
            )
            ds_ids = [ds_id for ds_id, in results]
            logger.info(f'Found {len(ds_ids)} corrupt colocalization sets')

    if not ds_ids:
        logger.warning('No datasets match filter')
        return

    if use_lithops:
        executor = Executor(sm_config['lithops'])

    for i, ds_id in enumerate(ds_ids):
        try:
            logger.info(f'Running colocalization on {i+1} out of {len(ds_ids)}')
            ds = Dataset.load(db, ds_id)
            coloc = Colocalization(db)
            if use_lithops:
                # noinspection PyUnboundLocalVariable
                coloc.run_coloc_job_lithops(executor, ds, reprocess=not skip_existing)
            else:
                coloc.run_coloc_job(ds, reprocess=not skip_existing)
        except Exception:
            logger.error(f'Failed to run colocalization on {ds_id}', exc_info=True)
def test_server_annotation_job(test_db, executor: Executor, sm_config, ds_config, metadata):
    db = DB()
    moldb_id = import_test_molecular_db()
    ds_config['database_ids'] = [moldb_id]
    ds_config['isotope_generation']['adducts'] = ['[M]+']  # test spectrum was made with no adduct
    # ds_config['isotope_generation']['n_peaks'] = 2  # minimize overlap between decoys and targets
    ds_config['image_generation']['ppm'] = 0.001  # minimize overlap between decoys and targets
    ds_config['fdr']['decoy_sample_size'] = len(MOCK_DECOY_ADDUCTS)
    input_path = upload_test_imzml(executor.storage, sm_config, ds_config)
    ds = Dataset(
        id=datetime.now().strftime('%Y-%m-%d_%Hh%Mm%Ss'),
        name='Test Lithops Dataset',
        input_path=input_path,
        upload_dt=datetime.now(),
        metadata=metadata,
        config=ds_config,
        is_public=True,
        status=DatasetStatus.QUEUED,
    )
    ds.save(db, None, allow_insert=True)

    with perf_profile(db, 'test_lithops_annotate', ds.id) as perf:
        # Overwrite executor's NullProfiler with a real profiler
        executor._perf = perf
        job = ServerAnnotationJob(executor=executor, ds=ds, perf=perf)
        job.run(debug_validate=True)

    def db_df(sql, args):
        return pd.DataFrame(db.select_with_fields(sql, args))

    jobs = db_df('SELECT * FROM job WHERE ds_id = %s', (ds.id,))
    anns = db_df(
        'SELECT * FROM annotation WHERE job_id = ANY(%s) ORDER BY msm DESC', (jobs.id.tolist(),)
    )
    diags = db_df('SELECT * FROM dataset_diagnostic WHERE ds_id = %s', (ds.id,))
    profiles = db_df('SELECT * FROM perf_profile WHERE ds_id = %s', (ds.id,))
    profile_entries = db_df(
        'SELECT * FROM perf_profile_entry WHERE profile_id = ANY(%s)', (profiles.id.tolist(),)
    )

    # For debugging annotations / FDR-related issues
    debug_data = job.pipe.debug_get_annotation_data(MOCK_FORMULAS[0], '')
    # print(debug_data)
    # db_data = load_cobjs(executor.storage, job.pipe.db_data_cobjs)[0]
    # print(db_data)
    # print(load_cobjs(executor.storage, job.pipe.ds_segms_cobjs))
    # moldb = pd.concat(load_cobjs(executor.storage, job.pipe.db_segms_cobjs))
    # formula_mzs = moldb.groupby('formula_i').mz.apply(list)
    # all_metrics = (
    #     job.pipe.formula_metrics_df.join(db_data['formula_map_df'].set_index('formula_i'))
    #     .join(formula_mzs)
    #     .sort_values('msm', ascending=False)
    # )
    # print(all_metrics)
    # print(job.pipe.ds_segments_bounds)
    # print(job.pipe.ds_segm_lens)
    # print(job.pipe.fdrs)
    # print(pd.DataFrame(anns))
    # print(pd.DataFrame(diags))
    # print(pd.DataFrame(profiles))
    # print(pd.DataFrame(profile_entries))

    # Validate jobs
    assert len(jobs) == 1
    assert jobs.moldb_id[0] == moldb_id

    # Validate annotations
    assert np.array_equal(anns.formula, MOCK_FORMULAS)  # Formulas should be MSM-descending
    assert np.array_equal(anns.fdr, [0.05] * 2 + [0.5] * 8)

    # Validate images were saved
    image_ids = [imgs[0] for imgs in anns.iso_image_ids]
    images = image_storage.get_ion_images_for_analysis(ds.id, image_ids)[0]
    assert images.shape == (len(anns), 4 * 4)
    # All non-masked pixels should have a value
    assert np.count_nonzero(images) == len(anns) * len(MOCK_COORDS)

    # Validate diagnostics
    metadata_diag = diags[diags.type == DiagnosticType.IMZML_METADATA].iloc[0]
    tic_diag = diags[diags.type == DiagnosticType.TIC].iloc[0]
    assert metadata_diag.error is None
    assert metadata_diag.data['n_spectra'] == len(MOCK_COORDS)
    assert metadata_diag.images[0]['key'] == DiagnosticImageKey.MASK
    mask_image = load_npy_image(ds.id, metadata_diag.images[0]['image_id'])
    assert np.count_nonzero(mask_image) == len(MOCK_COORDS)
    assert tic_diag.error is None
    assert tic_diag.data['min_tic'] > 0
    assert tic_diag.images[0]['key'] == DiagnosticImageKey.TIC
    tic_image = load_npy_image(ds.id, tic_diag.images[0]['image_id'])
    assert tic_image.dtype == np.float32
    assert tic_image.shape == (4, 4)
    assert np.array_equal(np.isnan(tic_image), ~mask_image)  # Masked area should be NaNs
    assert (tic_image[mask_image] > 0).all()  # Non-masked area should be non-zero

    # Validate perf profile
    assert len(profiles) == 1
    assert len(profile_entries) > 10