def define_centr_segments(
    fexec: Executor,
    clip_centr_chunks_cobjs: List[CloudObject],
    centr_n: int,
    ds_size_mb: int,
):
    logger.info('Defining centroids segments bounds')

    def get_first_peak_mz(idx, cobject, *, storage):
        print(f'Extracting first peak mz values from clipped centroids dataframe {idx}')
        centr_df = load_cobj(storage, cobject)
        first_peak_df = centr_df[centr_df.peak_i == 0]
        return first_peak_df.mz.values

    first_peak_df_mz = np.concatenate(
        fexec.map(
            get_first_peak_mz, list(enumerate(clip_centr_chunks_cobjs)), runtime_memory=512
        )
    )

    data_per_centr_segm_mb = 50
    peaks_per_centr_segm = 10000
    centr_segm_n = int(
        max(ds_size_mb // data_per_centr_segm_mb, centr_n // peaks_per_centr_segm, MIN_CENTR_SEGMS)
    )

    segm_bounds_q = [i * 1 / centr_segm_n for i in range(0, centr_segm_n)]
    centr_segm_lower_bounds = np.quantile(first_peak_df_mz, segm_bounds_q)

    logger.info(
        f'Generated {len(centr_segm_lower_bounds)} centroids bounds: '
        f'{centr_segm_lower_bounds[0]}...{centr_segm_lower_bounds[-1]}'
    )
    return centr_segm_lower_bounds

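# Illustrative sketch, not part of the original pipeline: it shows how quantile-derived lower
# bounds (as computed above with np.quantile) split first-peak m/z values into roughly equally
# populated segments, and how a value maps back to its segment with np.searchsorted (the same
# lookup segment_centroids uses). All values are made up; the helper is never called.
def _demo_centr_segment_bounds():
    import numpy as np

    first_peak_mz = np.random.uniform(100, 1000, size=10_000)
    centr_segm_n = 4
    # Each segment's lower bound is a quantile of the m/z distribution, so the segments
    # end up holding approximately the same number of first peaks.
    bounds = np.quantile(first_peak_mz, [i / centr_segm_n for i in range(centr_segm_n)])
    # A peak belongs to the last segment whose lower bound does not exceed its m/z
    segm_i = np.searchsorted(bounds, first_peak_mz, side='right') - 1
    print(np.bincount(segm_i))  # prints roughly [2500 2500 2500 2500]
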
def store_images_to_s3(
    executor: Executor,
    ds_id: str,
    formula_i_to_db_id: pd.Series,
    png_cobjs: List[CObj[List[Tuple[int, bytes]]]],
) -> DbFormulaImagesDict:
    """Upload PNG isotopic images to S3 image storage.

    Images may be uploaded multiple times if a formula_i is in multiple databases
    (i.e. there are duplicates in the formula_i_to_db_id index). This is intentional,
    as there's no check for reused images when deleting individual dataset jobs,
    e.g. by removing a moldb without reprocessing. It's easier to just avoid ever
    reusing images.
    """
    sm_config = SMConfig.get_conf()

    def _upload_png_batch(
        png_cobj: CObj[List[Tuple[int, bytes]]], *, storage: Storage, perf: SubtaskProfiler
    ):
        def _upload_images(pngs):
            return [
                image_storage.post_image(image_storage.ISO, ds_id, png)
                if png is not None
                else None
                for png in pngs
            ]

        formula_png_chunk = load_cobj(storage, png_cobj)
        image_storage = ImageStorage(sm_config)
        n_images = 0

        tasks = (
            pd.DataFrame(formula_png_chunk, columns=['formula_i', 'pngs'])
            .set_index('formula_i')
            .join(formula_i_to_db_id, how='inner')
        )

        # Limit parallelism to 6 to avoid accidentally hitting S3's upload limit (3500 PUTs/s).
        # The default parallelism (8 threads, because Cloud Functions get 4 CPUs) is slightly
        # too high for datasets with a lot of images.
        with ThreadPoolExecutor(6) as executor:
            db_formula_image_ids: DbFormulaImagesDict = defaultdict(dict)

            for db_id, formula_id, image_ids in zip(
                tasks.moldb_id, tasks.index, executor.map(_upload_images, tasks.pngs)
            ):
                db_formula_image_ids[db_id][formula_id] = image_ids
                n_images += len([i for i in image_ids if i is not None])

        perf.add_extra_data(n_tasks=len(tasks), n_images=n_images)

        return db_formula_image_ids

    results = executor.map(
        _upload_png_batch, [(cobj,) for cobj in png_cobjs], runtime_memory=512
    )

    db_formula_image_ids: DbFormulaImagesDict = defaultdict(dict)
    for result in results:
        for db_id, db_result in result.items():
            db_formula_image_ids[db_id].update(db_result)

    return db_formula_image_ids

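# Illustrative sketch, not part of the original pipeline: ThreadPoolExecutor.map returns results
# in input order, which is what lets _upload_png_batch zip the uploaded image ids back together
# with tasks.moldb_id / tasks.index. The fake_upload helper below is a hypothetical stand-in for
# _upload_images; the worker cap of 6 mirrors the S3 rate-limit comment above.
def _demo_ordered_parallel_upload():
    from concurrent.futures import ThreadPoolExecutor

    def fake_upload(pngs):
        # Pretend to upload each PNG and return an id, keeping None placeholders as-is
        return [f'img-{len(png)}' if png is not None else None for png in pngs]

    batches = [[b'\x89PNG', None], [b'\x89PNGPNG'], []]
    with ThreadPoolExecutor(6) as pool:
        results = list(pool.map(fake_upload, batches))
    assert results == [['img-4', None], ['img-8'], []]
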
def filter_results_and_make_pngs(
    fexec: Executor,
    formula_metrics_df: pd.DataFrame,
    moldbs: List[InputMolDb],
    fdrs: Dict[int, pd.DataFrame],
    images_df: pd.DataFrame,
    imzml_reader: LithopsImzMLReader,
):
    results_dfs = {}
    all_formula_is = set()
    for moldb_id, fdr in fdrs.items():
        result_df = (
            # Drop any columns already in fdr, as the FDR results may add or overwrite columns
            # with values from the scoring function.
            formula_metrics_df.drop(columns=fdr.columns, errors='ignore')
            .join(fdr, how='inner')
            .sort_values('fdr')
        )
        # Filter out zero-MSM annotations again to ensure that untargeted databases don't get
        # zero-MSM annotations, even if they have some overlap with targeted databases.
        is_targeted = any(db['targeted'] for db in moldbs if db['id'] == moldb_id)
        if not is_targeted:
            result_df = result_df[(result_df.msm > 0) & (result_df.fdr < 1)]
        results_dfs[moldb_id] = result_df
        all_formula_is.update(results_dfs[moldb_id].index)

    image_tasks_df = images_df[images_df.index.isin(all_formula_is)].copy()
    jobs = _split_png_jobs(image_tasks_df, imzml_reader.w, imzml_reader.h)
    png_generator = PngGenerator(imzml_reader.mask)

    def save_png_chunk(df: pd.DataFrame, *, storage: Storage):
        pngs = []
        groups = defaultdict(list)
        for formula_i, cobj in df.cobj.items():
            groups[cobj].append(formula_i)

        image_dict_iter = iter_cobjs_with_prefetch(storage, list(groups.keys()))

        for image_dict, formula_is in zip(image_dict_iter, groups.values()):
            for formula_i in formula_is:
                formula_pngs = [
                    png_generator.generate_png(img.toarray()) if img is not None else None
                    for img in image_dict[formula_i]
                ]
                pngs.append((formula_i, formula_pngs))
        return save_cobj(storage, pngs)

    png_cobjs = fexec.map(save_png_chunk, jobs, include_modules=['png'], runtime_memory=1024)
    return results_dfs, png_cobjs

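# Illustrative sketch, not part of the original pipeline: the untargeted-database filter above
# keeps only annotations with a positive MSM score and an FDR strictly below 1. Toy dataframe
# with made-up values:
def _demo_untargeted_filter():
    import pandas as pd

    result_df = pd.DataFrame({'msm': [0.9, 0.0, 0.5], 'fdr': [0.05, 0.5, 1.0]}, index=[1, 2, 3])
    filtered = result_df[(result_df.msm > 0) & (result_df.fdr < 1)]
    assert list(filtered.index) == [1]  # formula 2 has zero MSM, formula 3 has FDR == 1
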
def run_fdr(
    executor: Executor,
    formula_scores_df: pd.DataFrame,
    db_data_cobjs: List[CObj[DbFDRData]],
    ds_config: DSConfig,
) -> Dict[int, pd.DataFrame]:
    def _run_fdr_for_db(db_data_cobject: CObj[DbFDRData], *, storage: Storage):
        print(f'Loading FDR data from {db_data_cobject}')
        db_data = load_cobj(storage, db_data_cobject)
        moldb_id = db_data['id']
        fdr = db_data['fdr']
        formula_map_df = db_data['formula_map_df']

        formula_msm = formula_map_df.merge(
            formula_scores_df, how='inner', left_on='formula_i', right_index=True
        )
        modifiers = fdr.target_modifiers_df[['chem_mod', 'neutral_loss', 'adduct']]
        results_df = (
            fdr.estimate_fdr(formula_msm, scoring_model)
            .assign(moldb_id=moldb_id)
            .set_index('formula_i')
            .merge(modifiers, left_on='modifier', right_index=True, how='outer')
        )

        return db_data['id'], results_df

    logger.info('Estimating FDRs...')
    scoring_model = load_scoring_model(ds_config['fdr'].get('scoring_model'))

    args = [(db_data_cobj,) for db_data_cobj in db_data_cobjs]
    results = executor.map(_run_fdr_for_db, args, runtime_memory=2048)

    for moldb_id, moldb_fdrs in results:
        logger.info(f'DB {moldb_id} number of annotations with FDR less than:')
        for fdr_step in [0.05, 0.1, 0.2, 0.5]:
            logger.info(f'{fdr_step * 100:2.0f}%: {(moldb_fdrs.fdr <= fdr_step).sum()}')

    return dict(results)

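# Illustrative sketch, not part of the original pipeline: the per-database report above counts
# annotations at or below each FDR threshold with a boolean sum. Toy FDR values, made up:
def _demo_fdr_threshold_counts():
    import pandas as pd

    moldb_fdrs = pd.DataFrame({'fdr': [0.01, 0.05, 0.08, 0.2, 0.6]})
    counts = {
        fdr_step: int((moldb_fdrs.fdr <= fdr_step).sum()) for fdr_step in [0.05, 0.1, 0.2, 0.5]
    }
    assert counts == {0.05: 2, 0.1: 3, 0.2: 4, 0.5: 4}
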
def segment_centroids(
    fexec: Executor,
    peaks_cobjs: List[CObj[pd.DataFrame]],
    ds_segms_cobjs: List[CObj[pd.DataFrame]],
    ds_segms_bounds: np.ndarray,
    ds_segm_size_mb: int,
    is_intensive_dataset: bool,
    isocalc_wrapper: IsocalcWrapper,
) -> List[CObj[pd.DataFrame]]:
    # pylint: disable=too-many-locals
    mz_min, mz_max = ds_segms_bounds[0, 0], ds_segms_bounds[-1, 1]

    clip_centr_chunks_cobjs, centr_n = clip_centr_df(fexec, peaks_cobjs, mz_min, mz_max)

    # Define the first-level segmentation, then split each first-level segment into the
    # desired number of second-level segments
    centr_segm_lower_bounds = define_centr_segments(
        fexec,
        clip_centr_chunks_cobjs,
        centr_n,
        len(ds_segms_cobjs) * ds_segm_size_mb,
    )

    first_level_centr_segm_n = min(32, len(centr_segm_lower_bounds))
    centr_segm_lower_bounds = np.array_split(centr_segm_lower_bounds, first_level_centr_segm_n)
    first_level_centr_segm_bounds = np.array([bounds[0] for bounds in centr_segm_lower_bounds])

    def segment_centr_df(centr_df, db_segm_lower_bounds):
        first_peak_df = centr_df[centr_df.peak_i == 0].copy()
        segment_mapping = (
            np.searchsorted(db_segm_lower_bounds, first_peak_df.mz.values, side='right') - 1
        )
        first_peak_df['segm_i'] = segment_mapping
        centr_segm_df = pd.merge(
            centr_df, first_peak_df[['formula_i', 'segm_i']], on='formula_i'
        ).sort_values('mz')
        return centr_segm_df

    def segment_centr_chunk(idx, cobject, *, storage):
        print(f'Segmenting clipped centroids dataframe chunk {idx}')
        centr_df = load_cobj(storage, cobject)
        centr_segm_df = segment_centr_df(centr_df, first_level_centr_segm_bounds)

        def _first_level_upload(args):
            segm_i, df = args
            del df['segm_i']
            return segm_i, save_cobj(storage, df)

        with ThreadPoolExecutor(max_workers=128) as pool:
            sub_segms = list(centr_segm_df.groupby('segm_i'))
            sub_segms_cobjs = list(pool.map(_first_level_upload, sub_segms))

        return dict(sub_segms_cobjs)

    first_level_segms_cobjs = fexec.map(
        segment_centr_chunk, list(enumerate(clip_centr_chunks_cobjs)), runtime_memory=1024
    )

    def merge_centr_df_segments(segm_i, segm_cobjects, *, storage):
        print(f'Merging segment {segm_i} clipped centroids chunks')
        # Temporarily index by formula_i for faster filtering when saving
        segm = pd.concat(load_cobjs(storage, segm_cobjects)).set_index('formula_i')
        formula_segms_df = choose_ds_segments_per_formula(ds_segms_bounds, segm, isocalc_wrapper)

        # Try to balance formulas so that they all span roughly the same number of DS segments,
        # and have roughly the same number of formulas.
        max_segm_span = max((formula_segms_df.hi - formula_segms_df.lo).max(), 3)
        if is_intensive_dataset:
            max_segm_count = int(round(np.clip(centr_n / 1000, 1000, 5000)))
        else:
            max_segm_count = int(round(np.clip(centr_n / 1000, 1000, 15000)))

        formula_i_groups = []
        segm_lo_idx = 0
        while segm_lo_idx < len(formula_segms_df):
            max_segm_hi = formula_segms_df.lo[segm_lo_idx] + max_segm_span + 1
            max_span_idx = np.searchsorted(formula_segms_df.hi, max_segm_hi, 'left')
            segm_hi_idx = min(segm_lo_idx + max_segm_count, max_span_idx, len(formula_segms_df))
            formula_i_groups.append(formula_segms_df.formula_i.values[segm_lo_idx:segm_hi_idx])
            print(segm_lo_idx, segm_hi_idx)
            segm_lo_idx = segm_hi_idx

        def _second_level_upload(formula_is):
            return save_cobj(storage, segm.loc[formula_is].sort_values('mz').reset_index())

        print(f'Storing {len(formula_i_groups)} centroids segments')
        with ThreadPoolExecutor(max_workers=4) as pool:
            segms_cobjects = list(pool.map(_second_level_upload, formula_i_groups))

        return segms_cobjects

    second_level_segms_dict = defaultdict(list)
    for sub_segms_cobjs in first_level_segms_cobjs:
        for first_level_segm_i in sub_segms_cobjs:
            second_level_segms_dict[first_level_segm_i].append(
                sub_segms_cobjs[first_level_segm_i]
            )
    second_level_segms_cobjs = sorted(second_level_segms_dict.items(), key=lambda x: x[0])

    first_level_cobjs = [co for cos in first_level_segms_cobjs for co in cos.values()]

    db_segms_cobjs = fexec.map_concat(
        merge_centr_df_segments, second_level_segms_cobjs, runtime_memory=512
    )

    fexec.storage.delete_cloudobjects(first_level_cobjs)

    return db_segms_cobjs

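# Illustrative sketch, not part of the original pipeline: a toy run of the greedy grouping loop
# from merge_centr_df_segments. It assumes, as the code above does, that formula_segms_df is
# ordered so that 'hi' is non-decreasing (the order produced by choose_ds_segments_per_formula).
# Groups are cut when they would exceed either the allowed DS-segment span or the formula count.
# All numbers are made up.
def _demo_balance_formula_groups():
    import numpy as np
    import pandas as pd

    formula_segms_df = pd.DataFrame(
        {'formula_i': [1, 2, 3, 4, 5], 'lo': [0, 0, 1, 5, 6], 'hi': [1, 2, 3, 6, 8]}
    )
    max_segm_span, max_segm_count = 3, 2
    formula_i_groups = []
    segm_lo_idx = 0
    while segm_lo_idx < len(formula_segms_df):
        max_segm_hi = formula_segms_df.lo[segm_lo_idx] + max_segm_span + 1
        max_span_idx = np.searchsorted(formula_segms_df.hi, max_segm_hi, 'left')
        segm_hi_idx = min(segm_lo_idx + max_segm_count, max_span_idx, len(formula_segms_df))
        formula_i_groups.append(list(formula_segms_df.formula_i.values[segm_lo_idx:segm_hi_idx]))
        segm_lo_idx = segm_hi_idx
    assert formula_i_groups == [[1, 2], [3], [4, 5]]
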
def validate_centroids(fexec: Executor, peaks_cobjs: List[CObj[pd.DataFrame]]):
    # Ignore code duplicated with validate_centroid_segments, as the duplicated parts of the code
    # are too entangled with the non-duplicated parts
    def warn(message, df=None):
        warnings.append(message)
        logger.warning(message)
        if df is not None:
            logger.warning(df)

    def get_segm_stats(segm_cobject: CObj[pd.DataFrame], *, storage: Storage):
        segm = load_cobj(storage, segm_cobject)
        n_peaks = segm.groupby(level='formula_i').peak_i.count()
        formula_is = segm.index.unique()
        stats = pd.Series(
            {
                'min_mz': segm.mz.min(),
                'max_mz': segm.mz.max(),
                'min_formula_i': segm.index.min(),
                'max_formula_i': segm.index.max(),
                'avg_n_peaks': n_peaks.mean(),
                'min_n_peaks': n_peaks.min(),
                'max_n_peaks': n_peaks.max(),
                'max_int': segm.int.max(),
                'missing_peaks': (
                    segm.loc[n_peaks.index[n_peaks != 4]]
                    .groupby(level='formula_i')
                    .peak_i.apply(lambda peak_is: len(set(range(len(peak_is))) - set(peak_is)))
                    .sum()
                ),
                'n_formulas': len(formula_is),
                'n_peaks': len(segm),
            }
        )
        return formula_is, stats

    warnings: List[str] = []

    results = fexec.map(get_segm_stats, [(co,) for co in peaks_cobjs], runtime_memory=1024)
    segm_formula_is = [formula_is for formula_is, stats in results]
    stats_df = pd.DataFrame([stats for formula_is, stats in results])

    with pd.option_context(
        'display.max_rows', None, 'display.max_columns', None, 'display.width', 1000
    ):
        # Report cases with fewer peaks than expected (an indication that formulas are being
        # split between multiple segments)
        wrong_n_peaks = stats_df[
            (stats_df.avg_n_peaks < 3.9) | (stats_df.min_n_peaks < 2) | (stats_df.max_n_peaks > 4)
        ]
        if not wrong_n_peaks.empty:
            warn(
                'segment_centroids produced segments with unexpected peaks-per-formula '
                '(should be almost always 4, occasionally 2 or 3):',
                wrong_n_peaks,
            )

        # Report missing peaks
        missing_peaks = stats_df[stats_df.missing_peaks > 0]
        if not missing_peaks.empty:
            warn('segment_centroids produced segments with missing peaks:', missing_peaks)

    formula_in_segms_df = validate_formulas_not_in_multiple_segms(segm_formula_is, warn)

    logger.debug(
        f'Found {stats_df.n_peaks.sum()} peaks for {stats_df.n_formulas.sum()} formulas '
        f'across {len(peaks_cobjs)} segms'
    )
    n_per_segm = formula_in_segms_df.groupby('segm_i').formula_i.count()
    logger.debug(f'Segm sizes range from {n_per_segm.min()} to {n_per_segm.max()}')

    if warnings:
        try:
            __import__('__main__').stats_df = stats_df
            print('validate_centroids debug info written to "stats_df" variable')
        except Exception:
            pass
        raise AssertionError('Some checks failed in validate_centroids')

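# Illustrative sketch, not part of the original pipeline: the 'missing_peaks' statistic above
# counts, for every formula with a peak count other than 4, how many of the expected leading
# peak indices 0..n-1 are absent. Toy segment with made-up data:
def _demo_missing_peaks():
    import pandas as pd

    segm = pd.DataFrame(
        {'peak_i': [0, 1, 2, 3, 0, 2, 1, 3]},
        index=pd.Index([7, 7, 7, 7, 8, 8, 9, 9], name='formula_i'),
    )
    n_peaks = segm.groupby(level='formula_i').peak_i.count()
    missing_peaks = (
        segm.loc[n_peaks.index[n_peaks != 4]]
        .groupby(level='formula_i')
        .peak_i.apply(lambda peak_is: len(set(range(len(peak_is))) - set(peak_is)))
        .sum()
    )
    # Formula 8 has peaks {0, 2} where {0, 1} were expected -> 1 missing;
    # formula 9 has peaks {1, 3} where {0, 1} were expected -> 1 missing.
    assert missing_peaks == 2
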