def build_ranking(group_i, ranking_i, database, modifier, adduct, id, storage):
    """Build one MSM ranking (target when `adduct` is given, decoy otherwise).

    For every unmodified formula in `database`, looks up the MSM score for the
    molecule it becomes after the modifier and adduct are applied, and uploads
    the resulting ranking DataFrame as a cloud object.

    Returns (id, cloud_object) where cloud_object holds the serialised ranking.
    NOTE: the `id` parameter shadows the builtin — kept for interface stability.
    """
    print("Building ranking...")
    print(f'job_i: {id}')
    print(f'ranking_i: {ranking_i}')
    print(f'database: {database}')
    print(f'modifier: {modifier}')
    print(f'adduct: {adduct}')
    mols = read_cloud_object_with_retry(storage, mol_db_path_to_cobj[database], deserialise)
    if adduct is not None:
        # Target rankings use the same adduct for all molecules
        mol_formulas = list(map(safe_generate_ion_formula, mols, repeat(modifier), repeat(adduct)))
    else:
        # Decoy rankings use a consistent random adduct for each molecule, chosen so
        # that it doesn't overlap with other decoy rankings for this molecule
        adducts = _get_random_adduct_set(len(mols), decoy_adducts, ranking_i)
        mol_formulas = list(map(safe_generate_ion_formula, mols, repeat(modifier), adducts))

    formula_to_id = {}
    for cobject in formula_to_id_cobjects:
        formula_to_id_chunk = read_cloud_object_with_retry(storage, cobject, deserialise)
        for formula in mol_formulas:
            # Single .get() lookup (original called .get() twice per hit)
            formula_i = formula_to_id_chunk.get(formula)
            if formula_i is not None:
                formula_to_id[formula] = formula_i

    # `formula and ...` / `formula_i and ...` propagate None for failed ion formulas
    formula_is = [formula and formula_to_id.get(formula) for formula in mol_formulas]
    msm = [formula_i and msm_lookup.get(formula_i) for formula_i in formula_is]
    if adduct is not None:
        ranking_df = pd.DataFrame({'mol': mols, 'msm': msm}, index=formula_is)
    else:
        # Specific molecules don't matter in the decoy rankings, only their msm distribution
        ranking_df = pd.DataFrame({'msm': msm})
    # Drop unmatched/unscored rows (this line was duplicated in both branches originally)
    ranking_df = ranking_df[~ranking_df.msm.isna()]
    return id, storage.put_cloudobject(serialise(ranking_df))
def get_first_peak_mz(cobject, id, storage):
    """Return the m/z values of the principal (peak_i == 0) centroids in a chunk."""
    print(
        f'Extracting first peak mz values from clipped centroids dataframe {id}'
    )
    centroids = read_cloud_object_with_retry(storage, cobject, deserialise)
    principal_peaks = centroids[centroids.peak_i == 0]
    return principal_peaks.mz.values
def upload_chunk(ch_i, storage):
    """Pack one chunk of spectra into an mz-sorted (sp_idx, mz, int) array and upload it."""
    sp_inds = chunks[ch_i]
    # Get imzml_reader from COS because it's too big to include via pywren captured vars
    imzml_reader = pickle.loads(
        read_cloud_object_with_retry(storage, imzml_cobject))
    total_peaks = sum(imzml_reader.mzLengths[sp_i] for sp_i in sp_inds)
    buf = np.zeros((total_peaks, 3), dtype=imzml_reader.mzPrecision)
    offset = 0
    for sp_i, mzs, ints in get_spectra(ibd_path, imzml_reader, sp_inds):
        end = offset + len(mzs)
        buf[offset:end, 0] = sp_id_to_idx[sp_i]
        buf[offset:end, 1] = mzs
        buf[offset:end, 2] = ints
        offset = end
    # Order all peaks globally by m/z (column 1) before serialising
    order = np.argsort(buf[:, 1])
    buf = buf[order]
    del order
    payload = msgpack.dumps(buf)
    size = sys.getsizeof(payload) * (1 / 1024**2)
    logger.info(f'Uploading spectra chunk {ch_i} - %.2f MB' % size)
    chunk_cobject = storage.put_cobject(payload)
    logger.info(f'Spectra chunk {ch_i} finished')
    return chunk_cobject
def read_ds_segment(cobject):
    """Download one dataset segment and return it as a single ndarray.

    Multi-part segments are stored as a list of arrays and concatenated here.
    """
    data = read_cloud_object_with_retry(storage, cobject, msgpack.load)
    # isinstance instead of `type(data) == list` — idiomatic and subclass-safe
    if isinstance(data, list):
        return np.concatenate(data)
    return data
def get_target_images(images_cobject, storage):
    """Download the segment images and keep only the entries whose key is in `targets`."""
    segm_images = pickle.loads(
        read_cloud_object_with_retry(storage, images_cobject))
    return {key: imgs for key, imgs in segm_images.items() if key in targets}
def deduplicate_formulas_chunk(chunk_i, chunk_cobjects, storage):
    """Union all formula chunk parts into one deduplicated set."""
    print(f'Deduplicating formulas chunk {chunk_i}')
    deduped = set()
    for cobj in chunk_cobjects:
        part = read_cloud_object_with_retry(storage, cobj, deserialise)
        deduped.update(part)
    return deduped
def run_ranking(target_cobject, decoy_cobject, storage):
    """Compute per-target FDR values from one target/decoy ranking pair.

    Merges the two rankings, walks them in descending MSM order accumulating
    decoy/target counts, and returns the target rows (original index order)
    with a monotonic `fdr` column.
    """
    target = read_cloud_object_with_retry(storage, target_cobject, deserialise)
    decoy = read_cloud_object_with_retry(storage, decoy_cobject, deserialise)
    merged = pd.concat(
        [target.assign(is_target=1), decoy.assign(is_target=0)], sort=False)
    merged = merged.sort_values('msm', ascending=False)
    # `== 0` instead of `== False` (E712); identical result for the 0/1 ints assigned above
    decoy_cumsum = (merged.is_target == 0).cumsum()
    target_cumsum = merged.is_target.cumsum()
    # decoys-so-far / targets-so-far, clipped to [0, 1]
    base_fdr = np.clip(decoy_cumsum / target_cumsum, 0, 1)
    # 0/0 at the head of the list (before any target) → treat as FDR 1
    base_fdr[np.isnan(base_fdr)] = 1
    target_fdrs = merged.assign(fdr=base_fdr)[lambda df: df.is_target == 1]
    target_fdrs = target_fdrs.drop('is_target', axis=1)
    # Enforce monotonicity: a higher-MSM row never has a worse FDR than a lower one
    target_fdrs = target_fdrs.sort_values('msm')
    target_fdrs = target_fdrs.assign(
        fdr=np.minimum.accumulate(target_fdrs.fdr))
    target_fdrs = target_fdrs.sort_index()
    return target_fdrs
def get_target_images(images_cobject, storage):
    """Download segment images and keep target keys, optionally trimmed/PNG-encoded."""
    segm_images = read_cloud_object_with_retry(storage, images_cobject,
                                               deserialise)
    result = {}
    for key, imgs in segm_images.items():
        if key not in targets:
            continue
        if only_first_isotope:
            imgs = imgs[:1]
        if as_png:
            imgs = [None if img is None else to_png(img, mask)
                    for img in imgs]
        result[key] = imgs
    return result
def segment_centr_chunk(cobject, id, storage):
    """Split one centroids chunk into first-level segments and upload each in parallel.

    Returns {segm_i: cloud_object} for the uploaded sub-segments.
    """
    print(f'Segmenting clipped centroids dataframe chunk {id}')
    centr_df = read_cloud_object_with_retry(storage, cobject, deserialise)
    centr_segm_df = segment_centr_df(centr_df, first_level_centr_segm_bounds)

    def _upload_sub_segment(item):
        segm_i, segm_df = item
        # The segment id travels in the key; drop the redundant column before upload
        del segm_df['segm_i']
        return segm_i, storage.put_cloudobject(serialise(segm_df))

    groups = list(centr_segm_df.groupby('segm_i'))
    with ThreadPoolExecutor(max_workers=128) as pool:
        uploaded = list(pool.map(_upload_sub_segment, groups))
    return dict(uploaded)
def read_ds_segment(cobject, hybrid_impl, storage):
    """Download one dataset segment and normalise it to a DataFrame.

    Handles three stored shapes: a list of parts (arrays or frames), a raw
    (sp_i, mz, int) ndarray, or an already-built DataFrame.
    """
    segment = read_cloud_object_with_retry(storage, cobject, deserialise)
    if isinstance(segment, list):
        # Multi-part segment — concatenate according to the part type
        if isinstance(segment[0], np.ndarray):
            segment = np.concatenate(segment)
        else:
            segment = pd.concat(segment, ignore_index=True, sort=False)
    if isinstance(segment, np.ndarray):
        # Raw array: column 0 = spectrum index, 1 = m/z, 2 = intensity
        segment = pd.DataFrame({
            'mz': segment[:, 1],
            'int': segment[:, 2],
            'sp_i': segment[:, 0],
        })
    return segment
def run_fdr(db_data_cobject):
    """Estimate FDR for one database and return the annotated results frame.

    Loads (db, fdr, formula_map_df) from the given cloud object, attaches MSM
    scores, runs the FDR estimate, and maps each combined modifier back to its
    (modifier, adduct) pair. Result is indexed by 'formula_i'.
    """
    db, fdr, formula_map_df = read_cloud_object_with_retry(
        storage, db_data_cobject, deserialise)
    # Inner join keeps only formulas that actually received an MSM score
    formula_msm = formula_map_df.merge(msms_df, how='inner',
                                       left_on='formula_i',
                                       right_index=True)
    # Lookup table from combined modifier (index) to its components;
    # 'neutral_loss' is renamed to the output column name 'modifier'
    modifiers = fdr.target_modifiers_df[[
        'neutral_loss', 'adduct'
    ]].rename(columns={'neutral_loss': 'modifier'})
    # Rename the estimate's 'modifier' out of the way before merging the
    # per-component columns in, then drop the combined key
    results_df = (fdr.estimate_fdr(formula_msm).assign(
        database_path=db).set_index('formula_i').rename(columns={
            'modifier': 'combined_modifier',
            'formula': 'mol'
        }).merge(modifiers, left_on='combined_modifier',
                 right_index=True).drop(columns=['combined_modifier']))
    return results_df
def segment_spectra_chunk(chunk_cobject, id, storage):
    """Slice one mz-sorted spectra chunk into dataset segments and upload them in parallel."""
    print(f'Segmenting spectra chunk {id}')
    spectra = read_cloud_object_with_retry(storage, chunk_cobject,
                                           msgpack.load)

    def _upload_segment(segm_i):
        lo = ds_segments_bounds[segm_i][0, 0]
        hi = ds_segments_bounds[segm_i][-1, 1]
        # mz expected to be in column 1; chunk is pre-sorted so searchsorted applies
        start, end = np.searchsorted(spectra[:, 1], (lo, hi))
        return storage.put_cobject(msgpack.dumps(spectra[start:end]))

    with ThreadPoolExecutor(max_workers=128) as pool:
        return list(
            pool.map(_upload_segment, range(len(ds_segments_bounds))))
def process_centr_segment(db_segm_cobject, id, storage): print(f'Reading centroids segment {id}') # read database relevant part centr_df = read_cloud_object_with_retry(storage, db_segm_cobject, deserialise) # find range of datasets first_ds_segm_i, last_ds_segm_i = choose_ds_segments( ds_segments_bounds, centr_df, ppm) print(f'Reading dataset segments {first_ds_segm_i}-{last_ds_segm_i}') # read all segments in loop from COS sp_arr = read_ds_segments( ds_segms_cobjects[first_ds_segm_i:last_ds_segm_i + 1], ds_segms_len[first_ds_segm_i:last_ds_segm_i + 1], pw_mem_mb, ds_segm_size_mb, ds_segm_dtype, hybrid_impl, storage) formula_images_it = gen_iso_images(sp_inds=sp_arr.sp_i.values, sp_mzs=sp_arr.mz.values, sp_ints=sp_arr.int.values, centr_df=centr_df, nrows=nrows, ncols=ncols, ppm=ppm, min_px=1) if hybrid_impl: safe_mb = pw_mem_mb // 2 else: safe_mb = 1024 max_formula_images_mb = ( pw_mem_mb - safe_mb - (last_ds_segm_i - first_ds_segm_i + 1) * ds_segm_size_mb) // 3 print(f'Max formula_images size: {max_formula_images_mb} mb') images_manager = ImagesManager(storage, max_formula_images_mb * 1024**2) formula_image_metrics(formula_images_it, compute_metrics, images_manager) images_cloud_objs = images_manager.finish() print(f'Centroids segment {id} finished') formula_metrics_df = pd.DataFrame.from_dict( images_manager.formula_metrics, orient='index') formula_metrics_df.index.name = 'formula_i' return formula_metrics_df, images_cloud_objs
def get_segm_bounds(storage):
    """Estimate m/z segment boundaries by sampling spectra and taking quantiles.

    Returns an (n_segments, 2) array of [lower, upper] m/z bounds, sized so
    each segment is roughly ds_segm_size_mb when stored.
    """
    imzml_reader = pickle.loads(
        read_cloud_object_with_retry(storage, imzml_cobject))
    sp_n = len(imzml_reader.coordinates)
    # NOTE(review): np.random.choice defaults to replace=True, so the sample
    # may contain duplicate spectra — confirm whether replace=False was intended
    sample_sp_inds = np.random.choice(np.arange(sp_n), min(sp_n, sample_n))
    print(f'Sampling {len(sample_sp_inds)} spectra')
    spectra_sample = list(
        get_spectra(ibd_url, imzml_reader, sample_sp_inds))
    spectra_mzs = np.concatenate(
        [mzs for sp_id, mzs, ints in spectra_sample])
    print(f'Got {len(spectra_mzs)} mzs')
    # Extrapolate dataset size from the sample: 3 columns (sp_i, mz, int) per peak
    total_size = 3 * spectra_mzs.nbytes * sp_n / len(sample_sp_inds)
    segm_n = int(np.ceil(total_size / (ds_segm_size_mb * 2**20)))
    # Evenly spaced quantiles give segments with roughly equal peak counts
    segm_bounds_q = [i * 1 / segm_n for i in range(0, segm_n + 1)]
    segm_lower_bounds = [
        np.quantile(spectra_mzs, q) for q in segm_bounds_q
    ]
    # Pair consecutive quantiles into (lower, upper) bounds
    return np.array(
        list(zip(segm_lower_bounds[:-1], segm_lower_bounds[1:])))
def _get(cobj):
    """Load one formula chunk and invert it into a {formula: formula_i} mapping."""
    chunk = read_cloud_object_with_retry(storage, cobj, deserialise)
    return dict(zip(chunk.values, chunk.index))
def _get_mols(mols_cobj):
    """Download and deserialise one molecules cloud object."""
    mols = read_cloud_object_with_retry(storage, mols_cobj, deserialise)
    return mols
def _merge(ch_i):
    """Fetch and deserialise spectra segment chunk `ch_i`."""
    return read_cloud_object_with_retry(storage, segm_cobjects[ch_i],
                                        deserialise)
def _merge(cobject):
    """Download one centroids segment chunk and parse it with pandas' msgpack reader.

    NOTE(review): pd.read_msgpack was deprecated in pandas 0.25 and removed in
    1.0 — confirm the pinned pandas version, or switch to the project's
    `deserialise` helper as the sibling _merge implementations do.
    """
    segm_centr_df_chunk = read_cloud_object_with_retry(
        storage, cobject, pd.read_msgpack)
    return segm_centr_df_chunk
def _merge(ch_i):
    """Fetch and msgpack-decode spectra segment chunk `ch_i`."""
    return read_cloud_object_with_retry(storage, segm_cobjects[ch_i],
                                        msgpack.load)
def _merge(cobject):
    """Download and deserialise one centroids segment chunk."""
    return read_cloud_object_with_retry(storage, cobject, deserialise)