def build_ranking(group_i, ranking_i, database, modifier, adduct, id, storage):
    print("Building ranking...")
    print(f'job_i: {id}')
    print(f'ranking_i: {ranking_i}')
    print(f'database: {database}')
    print(f'modifier: {modifier}')
    print(f'adduct: {adduct}')
    # For every unmodified formula in `database`, look up the MSM score for the
    # molecule that it would become after the modifier and adduct are applied
    mols = pickle.loads(read_object_with_retry(storage, bucket, database))
    if adduct is not None:
        # Target rankings use the same adduct for all molecules
        mol_formulas = list(map(safe_generate_ion_formula, mols, repeat(modifier), repeat(adduct)))
    else:
        # Decoy rankings use a consistent random adduct for each molecule, chosen so that
        # it doesn't overlap with other decoy rankings for this molecule
        adducts = _get_random_adduct_set(len(mols), decoy_adducts, ranking_i)
        mol_formulas = list(map(safe_generate_ion_formula, mols, repeat(modifier), adducts))

    formula_to_id = {}
    keys = list_keys(bucket, f'{input_db["formula_to_id_chunks"]}/', storage)
    for key in keys:
        formula_to_id_chunk = read_object_with_retry(storage, bucket, key, msgpack_load_text)
        for formula in mol_formulas:
            if formula_to_id_chunk.get(formula) is not None:
                formula_to_id[formula] = formula_to_id_chunk[formula]

    # `formula and ...` propagates None for formulas that could not be generated,
    # instead of raising on the lookup
    formula_is = [formula and formula_to_id.get(formula) for formula in mol_formulas]
    msm = [formula_i and msm_lookup.get(formula_i) for formula_i in formula_is]
    if adduct is not None:
        ranking_df = pd.DataFrame({'mol': mols, 'msm': msm}, index=formula_is)
        ranking_df = ranking_df[~ranking_df.msm.isna()]
    else:
        # Specific molecules don't matter in the decoy rankings, only their msm distribution
        ranking_df = pd.DataFrame({'msm': msm})
        ranking_df = ranking_df[~ranking_df.msm.isna()]

    return id, storage.put_cobject(pickle.dumps(ranking_df))
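# A minimal, self-contained sketch of the chunked formula -> id lookup in
# build_ranking above. The chunk dicts stand in for the msgpack-encoded chunks
# read from COS; the formula strings and ids are invented for illustration.
def _formula_to_id_sketch():
    chunks = [{'C6H12O6+H': 0, 'C5H5N5+H': 1}, {'C10H16N5O13P3+H': 2}]
    mol_formulas = ['C6H12O6+H', 'C10H16N5O13P3+H', None]  # None = formula generation failed

    formula_to_id = {}
    for chunk in chunks:
        for formula in mol_formulas:
            if chunk.get(formula) is not None:
                formula_to_id[formula] = chunk[formula]

    # `formula and ...` keeps failed formulas as None instead of raising
    formula_is = [formula and formula_to_id.get(formula) for formula in mol_formulas]
    assert formula_is == [0, 2, None]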
def run_ranking(ibm_cos, data_bucket, target_key, decoy_key):
    target = pickle.loads(read_object_with_retry(ibm_cos, data_bucket, target_key))
    decoy = pickle.loads(read_object_with_retry(ibm_cos, data_bucket, decoy_key))
    merged = pd.concat([target.assign(is_target=1), decoy.assign(is_target=0)], sort=False)
    merged = merged.sort_values('msm', ascending=False)
    decoy_cumsum = (merged.is_target == False).cumsum()
    target_cumsum = merged.is_target.cumsum()
    base_fdr = np.clip(decoy_cumsum / target_cumsum, 0, 1)
    base_fdr[np.isnan(base_fdr)] = 1
    target_fdrs = merged.assign(fdr=base_fdr)[lambda df: df.is_target == 1]
    target_fdrs = target_fdrs.drop('is_target', axis=1)
    target_fdrs = target_fdrs.sort_values('msm')
    # Make the FDR monotonic: a target may never have a higher FDR than a target
    # with a lower MSM score
    target_fdrs = target_fdrs.assign(fdr=np.minimum.accumulate(target_fdrs.fdr))
    target_fdrs = target_fdrs.sort_index()
    return target_fdrs
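# A toy walk-through of the target-decoy FDR estimate computed in run_ranking:
# sort all hits by MSM descending, then the FDR at each position is
# decoys-so-far / targets-so-far, clipped to [0, 1]. The MSM values below are
# invented purely to show the arithmetic.
def _fdr_sketch():
    import numpy as np
    import pandas as pd

    target = pd.DataFrame({'msm': [0.9, 0.7, 0.4]})
    decoy = pd.DataFrame({'msm': [0.8, 0.3]})
    merged = pd.concat([target.assign(is_target=1), decoy.assign(is_target=0)], sort=False)
    merged = merged.sort_values('msm', ascending=False)

    fdr = np.clip((merged.is_target == 0).cumsum() / merged.is_target.cumsum(), 0, 1)
    # msm:  0.9  0.8  0.7  0.4  0.3
    # fdr:  0/1  1/1  1/2  1/3  2/3
    assert list(fdr.round(2)) == [0.0, 1.0, 0.5, 0.33, 0.67]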
def process_centr_segment(obj, ibm_cos, internal_storage):
    print(f'Reading centroids segment {obj.key}')
    # Read the relevant part of the centroids database, falling back to a
    # retried read if streaming the object fails
    try:
        centr_df = pd.read_msgpack(obj.data_stream)
    except Exception:
        centr_df = read_object_with_retry(ibm_cos, obj.bucket, obj.key, pd.read_msgpack)

    # Find the range of dataset segments that overlap this centroids segment
    first_ds_segm_i, last_ds_segm_i = choose_ds_segments(ds_segments_bounds, centr_df, ppm)
    print(f'Reading dataset segments {first_ds_segm_i}-{last_ds_segm_i}')
    # Read all the needed segments from COS in a loop
    sp_arr = read_ds_segments(ds_bucket, ds_segm_prefix, first_ds_segm_i, last_ds_segm_i,
                              ds_segms_len[first_ds_segm_i:last_ds_segm_i + 1],
                              pw_mem_mb, ds_segm_size_mb, ds_segm_dtype, ibm_cos)

    formula_images_it = gen_iso_images(sp_inds=sp_arr[:, 0], sp_mzs=sp_arr[:, 1], sp_ints=sp_arr[:, 2],
                                       centr_df=centr_df, nrows=nrows, ncols=ncols, ppm=ppm, min_px=1)
    safe_mb = 1024
    max_formula_images_mb = (pw_mem_mb - safe_mb
                             - (last_ds_segm_i - first_ds_segm_i + 1) * ds_segm_size_mb) // 3
    print(f'Max formula_images size: {max_formula_images_mb} mb')
    images_manager = ImagesManager(internal_storage, output_bucket, max_formula_images_mb * 1024 ** 2)
    formula_image_metrics(formula_images_it, compute_metrics, images_manager)
    images_cloud_objs = images_manager.finish()
    print(f'Centroids segment {obj.key} finished')

    formula_metrics_df = pd.DataFrame.from_dict(images_manager.formula_metrics, orient='index')
    formula_metrics_df.index.name = 'formula_i'
    return formula_metrics_df, images_cloud_objs
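# A quick worked instance of the memory-budget arithmetic in
# process_centr_segment: reserve headroom, subtract the dataset segments
# already held in memory, and give a third of the remainder to formula images.
# All numbers below are invented.
def _image_budget_sketch():
    pw_mem_mb, safe_mb, ds_segm_size_mb = 2048, 1024, 100
    n_segms = 3  # last_ds_segm_i - first_ds_segm_i + 1
    max_formula_images_mb = (pw_mem_mb - safe_mb - n_segms * ds_segm_size_mb) // 3
    assert max_formula_images_mb == 241  # (2048 - 1024 - 300) // 3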
def read_ds_segment(ds_segm_key):
    data = read_object_with_retry(ibm_cos, ds_bucket, ds_segm_key, msgpack.load)
    # A segment may be stored either as a single array or as a list of array chunks
    if isinstance(data, list):
        sp_arr = np.concatenate(data)
    else:
        sp_arr = data
    return sp_arr
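# A hedged sketch of why read_ds_segment handles both shapes: a dataset segment
# may have been written as one array or as a list of array chunks. This assumes
# msgpack_numpy is available so msgpack can round-trip numpy arrays; the arrays
# themselves are toy data.
def _segment_roundtrip_sketch():
    import io

    import msgpack
    import msgpack_numpy
    import numpy as np

    msgpack_numpy.patch()  # teach msgpack to (de)serialise numpy arrays

    chunks = [np.ones((2, 3)), np.zeros((1, 3))]
    buf = io.BytesIO(msgpack.packb(chunks))

    data = msgpack.load(buf)
    sp_arr = np.concatenate(data) if isinstance(data, list) else data
    assert sp_arr.shape == (3, 3)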
def deduplicate_formulas_segment(segm_i, storage, clean=True):
    print(f'Deduplicating formulas segment {segm_i}')
    keys = list_keys(bucket, f'{formulas_chunks_prefix}/chunk/{segm_i}/', storage)
    segm = set()
    for key in keys:
        segm_formulas_chunk = pickle.loads(read_object_with_retry(storage, bucket, key))
        segm.update(segm_formulas_chunk)

    if clean:
        clean_from_cos(config, bucket, f'{formulas_chunks_prefix}/chunk/{segm_i}/', storage)

    return segm
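# A tiny in-memory illustration of the chunk-union pattern used above: each
# chunk is a pickled set, and duplicates across chunks collapse when unioned.
# The formulas are invented and no object storage is involved.
def _dedup_sketch():
    import pickle

    chunks = [pickle.dumps({'H2O', 'CO2'}), pickle.dumps({'CO2', 'NH3'})]
    segm = set()
    for raw in chunks:
        segm.update(pickle.loads(raw))
    assert segm == {'H2O', 'CO2', 'NH3'}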
def clean(self, database=True, dataset=True, hard=False):
    unique_prefixes = []
    if not hard:
        if database:
            unique_prefixes.append(self.prefixes[':db'])
        if dataset:
            unique_prefixes.append(self.prefixes[':ds'])
        if database or dataset:
            unique_prefixes.append(self.prefixes[':ds/:db'])
    else:
        unique_prefixes.append(self.prefixes[''])

    keys = [
        key
        for prefix in unique_prefixes
        for key in self.storage.list_keys(self.bucket, prefix)
    ]

    # Collect all CloudObjects referenced by the cached values so they can be
    # deleted along with the cache entries themselves
    cobjects_to_clean = []
    for cache_key in keys:
        cache_data = read_object_with_retry(self.storage, self.bucket, cache_key, deserialise)
        if isinstance(cache_data, tuple):
            for obj in cache_data:
                if isinstance(obj, list):
                    if obj and isinstance(obj[0], CloudObject):
                        cobjects_to_clean.extend(obj)
                elif isinstance(obj, CloudObject):
                    cobjects_to_clean.append(obj)
        elif isinstance(cache_data, list):
            if cache_data and isinstance(cache_data[0], CloudObject):
                cobjects_to_clean.extend(cache_data)
        elif isinstance(cache_data, CloudObject):
            cobjects_to_clean.append(cache_data)

    self.storage.delete_cloudobjects(cobjects_to_clean)
    for prefix in unique_prefixes:
        keys = self.storage.list_keys(self.bucket, prefix)
        if keys:
            self.storage.delete_objects(self.bucket, keys)
            logger.info(
                f'Removed {len(keys)} objects from {self.storage.backend}://{self.bucket}/{prefix}'
            )
def _get_mols(mols_key):
    return pickle.loads(read_object_with_retry(ibm_cos, bucket, mols_key))
def _get(key):
    formula_chunk = read_object_with_retry(ibm_cos, bucket, key, pd.read_msgpack)
    formula_to_id_chunk = dict(zip(formula_chunk.formula, formula_chunk.index))
    return formula_to_id_chunk
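# A quick illustration of the mapping _get builds from a formulas chunk: the
# DataFrame index (formula_i) keyed by formula string. Toy data only.
def _formula_chunk_sketch():
    import pandas as pd

    formula_chunk = pd.DataFrame({'formula': ['H2O', 'CO2']}, index=[10, 11])
    formula_to_id_chunk = dict(zip(formula_chunk.formula, formula_chunk.index))
    assert formula_to_id_chunk == {'H2O': 10, 'CO2': 11}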
def _get_mols(mols_key):
    return pickle.loads(read_object_with_retry(storage, bucket, mols_key))
def _merge(key):
    segm_centr_df_chunk = read_object_with_retry(ibm_cos, bucket, key, pd.read_msgpack)
    return segm_centr_df_chunk
def _merge(key):
    segm_spectra_chunk = read_object_with_retry(ibm_cos, bucket, key, msgpack.load)
    return segm_spectra_chunk