def upload_chunk(ch_i, storage):
    chunk_sp_inds = chunks[ch_i]
    # Get imzml_reader from COS because it's too big to include via Lithops captured vars
    imzml_reader = read_cloud_object_with_retry(storage, imzml_reader_cobject, deserialise)

    # Allocate one buffer big enough for every peak of every spectrum in this chunk
    n_peaks = sum(imzml_reader.mzLengths[sp_i] for sp_i in chunk_sp_inds)
    sp_mz_int_buf = np.zeros((n_peaks, 3), dtype=imzml_reader.mzPrecision)

    chunk_start = 0
    for sp_i, mzs, ints in get_spectra(storage, ibd_cobject, imzml_reader, chunk_sp_inds):
        chunk_end = chunk_start + len(mzs)
        sp_mz_int_buf[chunk_start:chunk_end, 0] = sp_id_to_idx[sp_i]
        sp_mz_int_buf[chunk_start:chunk_end, 1] = mzs
        sp_mz_int_buf[chunk_start:chunk_end, 2] = ints
        chunk_start = chunk_end

    # Sort the whole chunk by m/z (column 1) before upload
    by_mz = np.argsort(sp_mz_int_buf[:, 1])
    sp_mz_int_buf = sp_mz_int_buf[by_mz]
    del by_mz

    chunk = serialise(sp_mz_int_buf)
    size_mb = sys.getsizeof(chunk) / 1024 ** 2
    logger.info(f'Uploading spectra chunk {ch_i} - {size_mb:.2f} MB')
    chunk_cobject = storage.put_cloudobject(chunk)
    logger.info(f'Spectra chunk {ch_i} finished')
    return chunk_cobject
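# The helpers used above (`serialise`, `deserialise`, `read_cloud_object_with_retry`)
# are defined elsewhere in the pipeline. A minimal sketch, assuming pickle-based
# serialisation over BytesIO and simple exponential backoff -- the real
# implementations may differ:
import pickle
from io import BytesIO
from time import sleep

def serialise(obj):
    return BytesIO(pickle.dumps(obj))

def deserialise(stream):
    return pickle.loads(stream.read())

def read_cloud_object_with_retry(storage, cobject, deserialiser, attempts=3):
    for attempt in range(attempts):
        try:
            return deserialiser(storage.get_cloudobject(cobject, stream=True))
        except Exception:
            if attempt == attempts - 1:
                raise
            sleep(2 ** attempt)  # back off before retrying the download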
def _first_level_segment_upload(segm_i):
    l = ds_segments_bounds[segm_i][0, 0]
    r = ds_segments_bounds[segm_i][-1, 1]
    # mz expected to be in column 1
    segm_start, segm_end = np.searchsorted(sp_mz_int_buf[:, 1], (l, r))
    segm = sp_mz_int_buf[segm_start:segm_end]
    return storage.put_cloudobject(serialise(segm))
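# Why searchsorted works here: the buffer is already sorted by m/z, so the two
# segment bounds map to slice indices in O(log n). For example:
#   >>> np.searchsorted([1.0, 2.0, 3.5, 7.2], (2.0, 3.5))
#   array([1, 2])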
def merge_spectra_chunk_segments(segm_cobjects, id, storage):
    print(f'Merging segment {id} spectra chunks')

    def _merge(ch_i):
        segm_spectra_chunk = read_cloud_object_with_retry(storage, segm_cobjects[ch_i], deserialise)
        return segm_spectra_chunk

    with ThreadPoolExecutor(max_workers=128) as pool:
        segm = list(pool.map(_merge, range(len(segm_cobjects))))

    segm = np.concatenate(segm)

    # Alternative in-place sorting (slower):
    # segm.view(f'{ds_segm_dtype},{ds_segm_dtype},{ds_segm_dtype}').sort(order=['f1'], axis=0)
    segm = segm[segm[:, 1].argsort()]

    bounds_list = ds_segments_bounds[id]
    base_id = sum(len(bounds) for bounds in ds_segments_bounds[:id])

    segms_len = []
    segms_cobjects = []
    for segm_j in range(len(bounds_list)):
        l, r = bounds_list[segm_j]
        # mz expected to be in column 1
        segm_start, segm_end = np.searchsorted(segm[:, 1], (l, r))
        sub_segm = segm[segm_start:segm_end]
        segms_len.append(len(sub_segm))
        segm_i = base_id + segm_j
        print(f'Storing dataset segment {segm_i}')
        segms_cobjects.append(storage.put_cloudobject(serialise(sub_segm)))

    return segms_len, segms_cobjects
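# A sketch of fanning this stage out with Lithops: one invocation per dataset
# segment, relying on Lithops to inject the reserved `id` and `storage`
# parameters. `segm_cobjects_per_segment` is an assumed name for the grouped
# first-level segment cloud objects:
import lithops

fexec = lithops.FunctionExecutor()
futures = fexec.map(merge_spectra_chunk_segments, segm_cobjects_per_segment)
results = fexec.get_result(futures)  # list of (segms_len, segms_cobjects) pairs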
def save_images(self):
    if self.formula_images:
        print(f'Saving {len(self.formula_images)} images')
        cloud_obj = self._storage.put_cloudobject(serialise(self.formula_images))
        self.cloud_objs.append(cloud_obj)
        self._partition += 1
    else:
        print('No images to save')
def build_ranking(group_i, ranking_i, database, modifier, adduct, id, storage):
    print('Building ranking...')
    print(f'job_i: {id}')
    print(f'ranking_i: {ranking_i}')
    print(f'database: {database}')
    print(f'modifier: {modifier}')
    print(f'adduct: {adduct}')
    # For every unmodified formula in `database`, look up the MSM score for the molecule
    # that it would become after the modifier and adduct are applied
    mols = read_cloud_object_with_retry(storage, mol_db_path_to_cobj[database], deserialise)
    if adduct is not None:
        # Target rankings use the same adduct for all molecules
        mol_formulas = list(map(safe_generate_ion_formula, mols, repeat(modifier), repeat(adduct)))
    else:
        # Decoy rankings use a consistent random adduct for each molecule, chosen so that
        # it doesn't overlap with other decoy rankings for this molecule
        adducts = _get_random_adduct_set(len(mols), decoy_adducts, ranking_i)
        mol_formulas = list(map(safe_generate_ion_formula, mols, repeat(modifier), adducts))

    # Resolve formula -> formula_i across the chunked formula_to_id dictionaries
    formula_to_id = {}
    for cobject in formula_to_id_cobjects:
        formula_to_id_chunk = read_cloud_object_with_retry(storage, cobject, deserialise)
        for formula in mol_formulas:
            formula_i = formula_to_id_chunk.get(formula)
            if formula_i is not None:
                formula_to_id[formula] = formula_i

    formula_is = [formula and formula_to_id.get(formula) for formula in mol_formulas]
    msm = [formula_i and msm_lookup.get(formula_i) for formula_i in formula_is]
    if adduct is not None:
        ranking_df = pd.DataFrame({'mol': mols, 'msm': msm}, index=formula_is)
        ranking_df = ranking_df[~ranking_df.msm.isna()]
    else:
        # Specific molecules don't matter in the decoy rankings, only their msm distribution
        ranking_df = pd.DataFrame({'msm': msm})
        ranking_df = ranking_df[~ranking_df.msm.isna()]

    return id, storage.put_cloudobject(serialise(ranking_df))
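# `_get_random_adduct_set` is implemented elsewhere; a hypothetical sketch of the
# property described above (a fixed seed plus one permutation per molecule gives
# each molecule a consistent adduct that never repeats across decoy rankings):
import numpy as np

def _get_random_adduct_set(n, decoy_adducts, ranking_i):
    rng = np.random.RandomState(0)  # fixed seed: every ranking sees the same permutations
    perms = np.array([rng.permutation(decoy_adducts) for _ in range(n)])
    return perms[:, ranking_i]  # values in column `ranking_i` are unique within each row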
def _upload(segm_i):
    segm = pd.concat(
        [
            deserialise_from_file(ds_segments_path / f'ds_segm_{segm_i:04}_{chunk_i:04}')
            for chunk_i in range(chunks_n)
        ],
        ignore_index=True,
        sort=False,
    )
    segm.sort_values('mz', inplace=True)
    segm.reset_index(drop=True, inplace=True)
    segm = serialise(segm)
    logger.debug(f'Uploading segment {segm_i}: {segm.getbuffer().nbytes} bytes')
    return storage.put_cloudobject(segm)
def clip_centr_df_chunk(peaks_i, peaks_cobject, storage):
    print(f'Clipping centroids dataframe chunk {peaks_i}')
    centroids_df_chunk = deserialise(storage.get_cloudobject(peaks_cobject, stream=True)).sort_values('mz')
    centroids_df_chunk = centroids_df_chunk[centroids_df_chunk.mz > 0]

    # Keep only formulas that have at least one peak inside the dataset's m/z range
    ds_mz_range_unique_formulas = centroids_df_chunk[
        (mz_min < centroids_df_chunk.mz) & (centroids_df_chunk.mz < mz_max)
    ].index.unique()
    centr_df_chunk = centroids_df_chunk[
        centroids_df_chunk.index.isin(ds_mz_range_unique_formulas)
    ].reset_index()

    clip_centr_chunk_cobject = storage.put_cloudobject(serialise(centr_df_chunk))
    return clip_centr_chunk_cobject, centr_df_chunk.shape[0]
def calculate_peaks_chunk(segm_i, segm_cobject, storage):
    print(f'Calculating peaks from formulas chunk {segm_i}')
    chunk_df = deserialise(storage.get_cloudobject(segm_cobject, stream=True))
    peaks = [
        peak
        for formula_i, formula in chunk_df.items()
        for peak in calculate_peaks_for_formula(formula_i, formula)
    ]
    peaks_df = pd.DataFrame(peaks, columns=['formula_i', 'peak_i', 'mz', 'int'])
    peaks_df.set_index('formula_i', inplace=True)

    print(f'Storing centroids chunk {segm_i}')
    peaks_cobject = storage.put_cloudobject(serialise(peaks_df))
    return peaks_cobject, peaks_df.shape[0]
def store_formula_to_id_chunk(ch_i, input_cobjects, storage):
    print(f'Storing formula_to_id dictionary chunk {ch_i}')

    def _get(cobj):
        formula_chunk = read_cloud_object_with_retry(storage, cobj, deserialise)
        formula_to_id_chunk = dict(zip(formula_chunk.values, formula_chunk.index))
        return formula_to_id_chunk

    formula_to_id = {}
    with ThreadPoolExecutor(max_workers=128) as pool:
        for chunk_dict in pool.map(_get, input_cobjects):
            formula_to_id.update(chunk_dict)

    return storage.put_cloudobject(serialise(formula_to_id))
def _store(segm_i):
    segm_id = chunk_i * n_threads + segm_i
    print(f'Storing formulas segment {segm_id}')
    return storage.put_cloudobject(serialise(segm_list[segm_i]))
def save(self, data, key):
    self.storage.put_object(self.bucket, self.resolve_key(key), serialise(data))
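# A sketch of the wrapper class this method appears to belong to; the class
# name and the `bucket`/`prefix`/`resolve_key` semantics are assumptions, not
# taken from the source:
class CObjCache:
    def __init__(self, storage, bucket, prefix):
        self.storage = storage  # Lithops Storage instance
        self.bucket = bucket
        self.prefix = prefix

    def resolve_key(self, key):
        return f'{self.prefix}/{key}'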
def _upload(path):
    mol_sfs = sorted(set(pd.read_csv(path).sf))
    return storage.put_cloudobject(serialise(mol_sfs))
def _second_level_upload(df):
    return storage.put_cloudobject(serialise(df))
def _first_level_upload(args):
    segm_i, df = args
    del df['segm_i']
    return segm_i, storage.put_cloudobject(serialise(df))
def _store(chunk_i):
    return chunk_i, storage.put_cloudobject(serialise(formulas_chunks[chunk_i]))
def get_portable_imzml_reader(storage):
    imzml_stream = storage.get_cloudobject(imzml_cobject, stream=True)
    parser = ImzMLParser(imzml_stream, ibd_file=None)
    imzml_reader = parser.portable_spectrum_reader()
    imzml_reader_cobject = storage.put_cloudobject(serialise(imzml_reader))
    return imzml_reader, imzml_reader_cobject
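# Example of driving the function above through Lithops; the executor injects
# the reserved `storage` parameter and `imzml_cobject` is captured from the
# enclosing scope. A sketch, not pinned to any particular Lithops config:
import lithops

fexec = lithops.FunctionExecutor()
future = fexec.call_async(get_portable_imzml_reader, ())
imzml_reader, imzml_reader_cobject = fexec.get_result(future)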
def _store_db_data(db_data):
    return storage.put_cloudobject(serialise(db_data))
def _store(segm):
    return storage.put_cloudobject(serialise(segm))