def upload_chunk(ch_i, storage):
    chunk_sp_inds = chunks[ch_i]
    # Get imzml_reader from COS because it's too big to include via pywren captured vars
    imzml_reader = pickle.loads(read_cloud_object_with_retry(storage, imzml_cobject))
    n_spectra = sum(imzml_reader.mzLengths[sp_i] for sp_i in chunk_sp_inds)
    sp_mz_int_buf = np.zeros((n_spectra, 3), dtype=imzml_reader.mzPrecision)

    chunk_start = 0
    for sp_i, mzs, ints in get_spectra(ibd_path, imzml_reader, chunk_sp_inds):
        chunk_end = chunk_start + len(mzs)
        sp_mz_int_buf[chunk_start:chunk_end, 0] = sp_id_to_idx[sp_i]
        sp_mz_int_buf[chunk_start:chunk_end, 1] = mzs
        sp_mz_int_buf[chunk_start:chunk_end, 2] = ints
        chunk_start = chunk_end

    by_mz = np.argsort(sp_mz_int_buf[:, 1])
    sp_mz_int_buf = sp_mz_int_buf[by_mz]
    del by_mz

    chunk = msgpack.dumps(sp_mz_int_buf)
    size = sys.getsizeof(chunk) / (1024 ** 2)
    logger.info(f'Uploading spectra chunk {ch_i} - {size:.2f} MB')
    chunk_cobject = storage.put_cobject(chunk)
    logger.info(f'Spectra chunk {ch_i} finished')
    return chunk_cobject

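# Illustration (not part of the pipeline): upload_chunk packs every peak of a chunk into a flat
# (n_peaks, 3) array of [spectrum index, m/z, intensity] rows and then sorts the whole buffer by
# m/z before serialising it. A minimal sketch with synthetic values, using only numpy:
#
#     import numpy as np
#
#     spectra = [
#         (0, np.array([104.1, 98.7]), np.array([10.0, 5.0])),  # (spectrum index, m/z values, intensities)
#         (1, np.array([77.2]),        np.array([3.0])),
#     ]
#     n_peaks = sum(len(mzs) for _, mzs, _ in spectra)
#     buf = np.zeros((n_peaks, 3))
#
#     row = 0
#     for sp_idx, mzs, ints in spectra:
#         end = row + len(mzs)
#         buf[row:end, 0] = sp_idx
#         buf[row:end, 1] = mzs
#         buf[row:end, 2] = ints
#         row = end
#
#     buf = buf[np.argsort(buf[:, 1])]  # all peaks of the chunk ordered by m/z: 77.2, 98.7, 104.1
#     print(buf)
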
def define_ds_segments(pw, ibd_url, imzml_cobject, ds_segm_size_mb, sample_n):
    def get_segm_bounds(storage):
        imzml_reader = pickle.loads(read_cloud_object_with_retry(storage, imzml_cobject))
        sp_n = len(imzml_reader.coordinates)
        sample_sp_inds = np.random.choice(np.arange(sp_n), min(sp_n, sample_n))
        print(f'Sampling {len(sample_sp_inds)} spectra')

        spectra_sample = list(get_spectra(ibd_url, imzml_reader, sample_sp_inds))
        spectra_mzs = np.concatenate([mzs for sp_id, mzs, ints in spectra_sample])
        print(f'Got {len(spectra_mzs)} mzs')

        total_size = 3 * spectra_mzs.nbytes * sp_n / len(sample_sp_inds)
        segm_n = int(np.ceil(total_size / (ds_segm_size_mb * 2 ** 20)))

        segm_bounds_q = [i * 1 / segm_n for i in range(0, segm_n + 1)]
        segm_lower_bounds = [np.quantile(spectra_mzs, q) for q in segm_bounds_q]
        return np.array(list(zip(segm_lower_bounds[:-1], segm_lower_bounds[1:])))

    logger.info('Defining dataset segments bounds')
    memory_capacity_mb = 1024
    future = pw.call_async(get_segm_bounds, [], runtime_memory=memory_capacity_mb)
    ds_segments = pw.get_result(future)
    append_pywren_stats(future, memory_mb=memory_capacity_mb)
    return ds_segments

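# Illustration (not part of the pipeline): define_ds_segments places segment boundaries at
# quantiles of the sampled m/z values, so each segment ends up holding roughly the same number
# of peaks regardless of how m/z values are distributed. Synthetic data, numpy only:
#
#     import numpy as np
#
#     spectra_mzs = np.random.uniform(100, 1000, size=100_000)  # stand-in for sampled m/z values
#     segm_n = 4
#     qs = [i / segm_n for i in range(segm_n + 1)]
#     bounds = [np.quantile(spectra_mzs, q) for q in qs]
#     segments = np.array(list(zip(bounds[:-1], bounds[1:])))
#     print(segments)  # 4 (lower, upper) m/z ranges with ~25,000 peaks each
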
def log_bad_results(merged_results, missing_results, spatial_wrong, spectral_wrong, chaos_wrong,
                    msm_wrong, fdr_error):
    fdr_any_error = fdr_error[lambda df: df.fdr_error > 0]
    fdr_big_error = fdr_error[lambda df: df.fdr_error > 1]
    results = [
        # Name, Maximum allowed, Actual value, Extra data
        ('Missing annotations', 0, len(missing_results), missing_results.head()),
        # A small number of results are off by up to 1% due to an algorithm change since they were processed.
        # Annotations with fewer than 4 ion images now have slightly higher spatial and spectral scores than before.
        ('Incorrect spatial metric', 2, len(spatial_wrong), spatial_wrong.head()),
        ('Incorrect spectral metric', 5, len(spectral_wrong), spectral_wrong.head()),
        ('Incorrect chaos metric', 0, len(chaos_wrong), chaos_wrong.head()),
        ('Incorrect MSM', 2, len(msm_wrong), msm_wrong.head()),
        # FDR can vary significantly depending on which decoy adducts were chosen.
        ('FDR changed', len(merged_results) * 0.25, len(fdr_any_error), fdr_any_error.head()),
        ('FDR changed significantly', len(merged_results) * 0.1, len(fdr_big_error), fdr_big_error.head()),
    ]
    failed_results = []
    for result_name, threshold, value, data in results:
        if value <= threshold:
            logger.info(f'{result_name}: {value} (PASS)')
        else:
            logger.error(f'{result_name}: {value} (FAIL)')
            failed_results.append((result_name, data))

    for result_name, data in failed_results:
        logger.error(f'{result_name} extra info:\n{str(data)}\n')

    if not failed_results:
        logger.info('All checks pass')
    else:
        logger.error(f'{len(failed_results)} checks failed')

def segment_ds(self):
    ds_segments_cache_key = f'{self.cacher.prefix}/segment_ds.cache'
    if self.cacher.exists(ds_segments_cache_key):
        self.ds_segments_bounds, self.ds_segms_cobjects, self.ds_segms_len = \
            self.cacher.load(ds_segments_cache_key)
        logger.info(f'Loaded {len(self.ds_segms_cobjects)} dataset segments from cache')
    else:
        sample_sp_n = 1000
        self.ds_segments_bounds = define_ds_segments(
            self.pywren_executor, self.ds_config["ibd_path"], self.imzml_cobject,
            self.ds_segm_size_mb, sample_sp_n)
        self.ds_segms_cobjects, self.ds_segms_len = \
            segment_spectra(self.pywren_executor, self.ds_chunks_cobjects,
                            self.ds_segments_bounds, self.ds_segm_size_mb,
                            self.imzml_reader.mzPrecision)
        logger.info(f'Segmented dataset chunks into {len(self.ds_segms_cobjects)} segments')
        self.cacher.save((self.ds_segments_bounds, self.ds_segms_cobjects, self.ds_segms_len),
                         ds_segments_cache_key)

    self.ds_segm_n = len(self.ds_segms_cobjects)
    self.is_intensive_dataset = self.ds_segm_n * self.ds_segm_size_mb > 5000

def segment_centroids(self):
    mz_min, mz_max = self.ds_segments_bounds[0, 0], self.ds_segments_bounds[-1, 1]

    db_segments_cache_key = f'{self.cacher.prefix}/segment_centroids.cache'
    if self.cacher.exists(db_segments_cache_key):
        self.clip_centr_chunks_cobjects, self.db_segms_cobjects = \
            self.cacher.load(db_segments_cache_key)
        logger.info(f'Loaded {len(self.db_segms_cobjects)} centroids segments from cache')
    else:
        self.clip_centr_chunks_cobjects, centr_n = \
            clip_centr_df(self.pywren_executor, self.config["storage"]["db_bucket"],
                          self.db_config["centroids_chunks"], mz_min, mz_max)
        centr_segm_lower_bounds = define_centr_segments(
            self.pywren_executor, self.clip_centr_chunks_cobjects, centr_n,
            self.ds_segm_n, self.ds_segm_size_mb)

        max_ds_segms_size_per_db_segm_mb = 2560 if self.is_intensive_dataset else 1536
        self.db_segms_cobjects = segment_centroids(
            self.pywren_executor, self.clip_centr_chunks_cobjects, centr_segm_lower_bounds,
            self.ds_segments_bounds, self.ds_segm_size_mb, max_ds_segms_size_per_db_segm_mb,
            self.image_gen_config['ppm'])
        logger.info(f'Segmented centroids chunks into {len(self.db_segms_cobjects)} segments')

        self.cacher.save((self.clip_centr_chunks_cobjects, self.db_segms_cobjects),
                         db_segments_cache_key)

    self.centr_segm_n = len(self.db_segms_cobjects)

def calculate_fdrs_vm(storage, formula_scores_df, db_data_cobjects):
    t = time()

    msms_df = formula_scores_df[['msm']]

    def run_fdr(db_data_cobject):
        db, fdr, formula_map_df = read_cloud_object_with_retry(storage, db_data_cobject, deserialise)

        formula_msm = formula_map_df.merge(msms_df, how='inner', left_on='formula_i',
                                           right_index=True)
        modifiers = fdr.target_modifiers_df[['neutral_loss', 'adduct']].rename(
            columns={'neutral_loss': 'modifier'})
        results_df = (
            fdr.estimate_fdr(formula_msm)
            .assign(database_path=db)
            .set_index('formula_i')
            .rename(columns={'modifier': 'combined_modifier', 'formula': 'mol'})
            .merge(modifiers, left_on='combined_modifier', right_index=True)
            .drop(columns=['combined_modifier'])
        )
        return results_df

    logger.info('Estimating FDRs...')
    with ThreadPoolExecutor(os.cpu_count()) as pool:
        results_dfs = list(pool.map(run_fdr, db_data_cobjects))

    exec_time = time() - t
    return pd.concat(results_dfs), exec_time

def segment_centroids(self, use_cache=True, debug_validate=False):
    mz_min, mz_max = self.ds_segments_bounds[0, 0], self.ds_segments_bounds[-1, 1]

    cache_key = ':ds/:db/segment_centroids.cache'
    if use_cache and self.cacher.exists(cache_key):
        self.clip_centr_chunks_cobjects, self.db_segms_cobjects = self.cacher.load(cache_key)
        logger.info(f'Loaded {len(self.db_segms_cobjects)} centroids segments from cache')
    else:
        self.clip_centr_chunks_cobjects, centr_n = \
            clip_centr_df(self.lithops_executor, self.peaks_cobjects, mz_min, mz_max)
        centr_segm_lower_bounds = define_centr_segments(
            self.lithops_executor, self.clip_centr_chunks_cobjects, centr_n,
            self.ds_segm_n, self.ds_segm_size_mb)

        max_ds_segms_size_per_db_segm_mb = 2560 if self.is_intensive_dataset else 1536
        self.db_segms_cobjects = segment_centroids(
            self.lithops_executor, self.clip_centr_chunks_cobjects, centr_segm_lower_bounds,
            self.ds_segments_bounds, self.ds_segm_size_mb, max_ds_segms_size_per_db_segm_mb,
            self.image_gen_config['ppm'])
        logger.info(f'Segmented centroids chunks into {len(self.db_segms_cobjects)} segments')

        self.cacher.save((self.clip_centr_chunks_cobjects, self.db_segms_cobjects), cache_key)

    self.centr_segm_n = len(self.db_segms_cobjects)

    if debug_validate:
        validate_centroid_segments(
            self.lithops_executor, self.db_segms_cobjects,
            self.ds_segments_bounds, self.image_gen_config['ppm'])

def download_dataset(imzml_cobject, ibd_cobject, local_path, storage):
    def _download(url_or_cobject, path):
        if isinstance(url_or_cobject, CloudObject):
            stream = storage.get_cloudobject(url_or_cobject, stream=True)
            with path.open('wb') as f:
                copyfileobj(stream, f, 1024 * 1024)
        else:
            with requests.get(url_or_cobject, stream=True) as r:
                r.raise_for_status()
                with path.open('wb') as f:
                    # Stream in 1 MB chunks (iter_content defaults to 1-byte chunks otherwise)
                    for chunk in r.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)

    local_path = Path(local_path)
    local_path.mkdir(exist_ok=True)
    imzml_path = local_path / 'ds.imzML'
    ibd_path = local_path / 'ds.ibd'

    logger.info("Download dataset {} - {}".format(imzml_cobject, imzml_path))
    _download(imzml_cobject, imzml_path)
    logger.info("Download dataset {} - {}".format(ibd_cobject, ibd_path))
    _download(ibd_cobject, ibd_path)

    imzml_size = imzml_path.stat().st_size / (1024 ** 2)
    ibd_size = ibd_path.stat().st_size / (1024 ** 2)
    logger.debug(f'imzML size: {imzml_size:.2f} mb')
    logger.debug(f'ibd size: {ibd_size:.2f} mb')

    return imzml_path, ibd_path

def clip_centr_df(pw, peaks_cobjects, mz_min, mz_max):
    def clip_centr_df_chunk(peaks_i, peaks_cobject, storage):
        print(f'Clipping centroids dataframe chunk {peaks_i}')
        centroids_df_chunk = deserialise(
            storage.get_cloudobject(peaks_cobject, stream=True)).sort_values('mz')
        centroids_df_chunk = centroids_df_chunk[centroids_df_chunk.mz > 0]

        ds_mz_range_unique_formulas = centroids_df_chunk[
            (mz_min < centroids_df_chunk.mz) & (centroids_df_chunk.mz < mz_max)].index.unique()
        centr_df_chunk = centroids_df_chunk[
            centroids_df_chunk.index.isin(ds_mz_range_unique_formulas)].reset_index()
        clip_centr_chunk_cobject = storage.put_cloudobject(serialise(centr_df_chunk))

        return clip_centr_chunk_cobject, centr_df_chunk.shape[0]

    memory_capacity_mb = 512
    futures = pw.map(clip_centr_df_chunk, list(enumerate(peaks_cobjects)),
                     runtime_memory=memory_capacity_mb)
    clip_centr_chunks_cobjects, centr_n = list(zip(*pw.get_result(futures)))
    PipelineStats.append_func(futures, memory_mb=memory_capacity_mb, cloud_objects_n=len(futures))

    clip_centr_chunks_cobjects = list(clip_centr_chunks_cobjects)
    centr_n = sum(centr_n)
    logger.info(f'Prepared {centr_n} centroids')
    return clip_centr_chunks_cobjects, centr_n

def annotate(self):
    logger.info('Annotating...')
    clean_from_cos(self.config, self.config["storage"]["output_bucket"],
                   self.output["formula_images"])

    memory_capacity_mb = 2048  # TODO: Detect when this isn't enough and bump it up to 4096
    process_centr_segment = create_process_segment(
        self.config["storage"]["ds_bucket"], self.config["storage"]["output_bucket"],
        self.input_data["ds_segments"], self.ds_segments_bounds, self.ds_segms_len,
        self.coordinates, self.image_gen_config, memory_capacity_mb, self.ds_segm_size_mb,
        self.imzml_parser.mzPrecision)

    futures = self.pywren_executor.map(
        process_centr_segment,
        f'{self.config["storage"]["db_bucket"]}/{self.input_db["centroids_segments"]}/',
        runtime_memory=memory_capacity_mb)
    formula_metrics_list, images_cloud_objs = zip(*self.pywren_executor.get_result(futures))
    self.formula_metrics_df = pd.concat(formula_metrics_list)
    self.images_cloud_objs = list(chain(*images_cloud_objs))
    append_pywren_stats(futures, memory=memory_capacity_mb,
                        plus_objects=len(self.images_cloud_objs))

    logger.info(f'Metrics calculated: {self.formula_metrics_df.shape[0]}')

def define_centr_segments(pw, clip_centr_chunks_cobjects, centr_n, ds_segm_n, ds_segm_size_mb):
    logger.info('Defining centroids segments bounds')

    def get_first_peak_mz(cobject, id, storage):
        print(f'Extracting first peak mz values from clipped centroids dataframe {id}')
        centr_df = read_cloud_object_with_retry(storage, cobject, pd.read_msgpack)
        first_peak_df = centr_df[centr_df.peak_i == 0]
        return first_peak_df.mz.values

    memory_capacity_mb = 512
    futures = pw.map(get_first_peak_mz, clip_centr_chunks_cobjects,
                     runtime_memory=memory_capacity_mb)
    first_peak_df_mz = np.concatenate(pw.get_result(futures))
    append_pywren_stats(futures, memory_mb=memory_capacity_mb)

    ds_size_mb = ds_segm_n * ds_segm_size_mb
    data_per_centr_segm_mb = 50
    peaks_per_centr_segm = 1e4
    centr_segm_n = int(max(ds_size_mb // data_per_centr_segm_mb,
                           centr_n // peaks_per_centr_segm,
                           32))

    segm_bounds_q = [i * 1 / centr_segm_n for i in range(0, centr_segm_n)]
    centr_segm_lower_bounds = np.quantile(first_peak_df_mz, segm_bounds_q)

    logger.info(f'Generated {len(centr_segm_lower_bounds)} centroids bounds: '
                f'{centr_segm_lower_bounds[0]}...{centr_segm_lower_bounds[-1]}')
    return centr_segm_lower_bounds

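# Illustration (not part of the pipeline): the number of centroid segments is whichever is
# largest - one segment per ~50 MB of dataset, one per ~10,000 first-isotope peaks, or the
# floor of 32. Example arithmetic with made-up numbers:
#
#     ds_size_mb = 100 * 128   # e.g. 100 dataset segments of 128 MB -> 12800 MB
#     centr_n = 2_000_000      # e.g. total clipped centroids
#     centr_segm_n = int(max(ds_size_mb // 50, centr_n // 1e4, 32))
#     print(centr_segm_n)      # max(256, 200, 32) = 256
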
def clip_centr_df(pw, bucket, centr_chunks_prefix, mz_min, mz_max):
    def clip_centr_df_chunk(obj, storage):
        print(f'Clipping centroids dataframe chunk {obj.key}')
        centroids_df_chunk = pd.read_msgpack(obj.data_stream._raw_stream).sort_values('mz')
        centroids_df_chunk = centroids_df_chunk[centroids_df_chunk.mz > 0]

        ds_mz_range_unique_formulas = centroids_df_chunk[
            (mz_min < centroids_df_chunk.mz) & (centroids_df_chunk.mz < mz_max)].index.unique()
        centr_df_chunk = centroids_df_chunk[
            centroids_df_chunk.index.isin(ds_mz_range_unique_formulas)].reset_index()
        clip_centr_chunk_cobject = storage.put_cobject(centr_df_chunk.to_msgpack())

        return clip_centr_chunk_cobject, centr_df_chunk.shape[0]

    memory_capacity_mb = 512
    futures = pw.map(clip_centr_df_chunk, f'cos://{bucket}/{centr_chunks_prefix}/',
                     runtime_memory=memory_capacity_mb)
    clip_centr_chunks_cobjects, centr_n = list(zip(*pw.get_result(futures)))
    append_pywren_stats(futures, memory_mb=memory_capacity_mb, cloud_objects_n=len(futures))

    clip_centr_chunks_cobjects = list(clip_centr_chunks_cobjects)
    centr_n = sum(centr_n)
    logger.info(f'Prepared {centr_n} centroids')
    return clip_centr_chunks_cobjects, centr_n

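# Illustration (not part of the pipeline): clipping keeps *all* peaks of any formula that has at
# least one peak inside the dataset m/z range, rather than cutting individual rows. A minimal
# sketch with synthetic data, using only pandas:
#
#     import pandas as pd
#
#     centroids = pd.DataFrame(
#         {'mz': [100.0, 350.0, 420.0, 800.0, 990.0], 'int': [100, 40, 100, 60, 100]},
#         index=pd.Index([1, 1, 2, 2, 3], name='formula_i'),
#     )
#     mz_min, mz_max = 300.0, 900.0
#
#     in_range_formulas = centroids[(mz_min < centroids.mz) & (centroids.mz < mz_max)].index.unique()
#     clipped = centroids[centroids.index.isin(in_range_formulas)].reset_index()
#     print(clipped)  # formulas 1 and 2 survive with all their peaks; formula 3 is dropped
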
def calculate_centroids(pw, formula_cobjects, ds_config):
    polarity = ds_config['polarity']
    isocalc_sigma = ds_config['isocalc_sigma']

    def calculate_peaks_for_formula(formula_i, formula):
        mzs, ints = isocalc_wrapper.centroids(formula)
        if mzs is not None:
            return list(zip(repeat(formula_i), range(len(mzs)), mzs, ints))
        else:
            return []

    def calculate_peaks_chunk(segm_i, segm_cobject, storage):
        print(f'Calculating peaks from formulas chunk {segm_i}')
        chunk_df = deserialise(storage.get_cloudobject(segm_cobject, stream=True))
        peaks = [peak
                 for formula_i, formula in chunk_df.items()
                 for peak in calculate_peaks_for_formula(formula_i, formula)]
        peaks_df = pd.DataFrame(peaks, columns=['formula_i', 'peak_i', 'mz', 'int'])
        peaks_df.set_index('formula_i', inplace=True)

        print(f'Storing centroids chunk {segm_i}')
        peaks_cobject = storage.put_cloudobject(serialise(peaks_df))

        return peaks_cobject, peaks_df.shape[0]

    # Import lazily so that the rest of the pipeline still works if the dependency is missing
    from annotation_pipeline.isocalc_wrapper import IsocalcWrapper
    isocalc_wrapper = IsocalcWrapper({
        # These instrument settings are usually customized on a per-dataset basis out of a set of
        # 18 possible combinations, but most of EMBL's datasets are compatible with the following settings:
        'charge': {
            'polarity': polarity,
            'n_charges': 1,
        },
        'isocalc_sigma': float(f"{isocalc_sigma:f}")  # Rounding to match production implementation
    })

    memory_capacity_mb = 2048
    futures = pw.map(calculate_peaks_chunk, list(enumerate(formula_cobjects)),
                     runtime_memory=memory_capacity_mb)
    results = pw.get_result(futures)
    PipelineStats.append_func(futures, memory_mb=memory_capacity_mb, cloud_objects_n=len(futures))

    num_centroids = sum(count for cobj, count in results)
    n_centroids_chunks = len(results)
    peaks_cobjects = [cobj for cobj, count in results]
    logger.info(f'Calculated {num_centroids} centroids in {n_centroids_chunks} chunks')
    return peaks_cobjects

def upload_molecular_databases(self, use_cache=True):
    cache_key = ':db/upload_molecular_databases.cache'
    if use_cache and self.cacher.exists(cache_key):
        self.mols_dbs_cobjects = self.cacher.load(cache_key)
        logger.info(f'Loaded {len(self.mols_dbs_cobjects)} molecular databases from cache')
    else:
        self.mols_dbs_cobjects = upload_mol_dbs_from_dir(self.storage, self.db_config['databases'])
        logger.info(f'Uploaded {len(self.mols_dbs_cobjects)} molecular databases')
        self.cacher.save(self.mols_dbs_cobjects, cache_key)

def _upload_chunk(ch_i, sp_mz_int_buf):
    chunk = msgpack.dumps(sp_mz_int_buf)
    key = f'{input_data["ds_chunks"]}/{ch_i}.msgpack'
    size = sys.getsizeof(chunk) / (1024 ** 2)
    logger.info(f'Uploading spectra chunk {ch_i} - {size:.2f} MB')
    cos_client.put_object(Bucket=config["storage"]["ds_bucket"], Key=key, Body=chunk)
    logger.info(f'Spectra chunk {ch_i} finished')
    return key

def run_fdr(self):
    self.rankings_df = build_fdr_rankings(
        self.pywren_executor, self.config["storage"]["db_bucket"],
        self.input_data, self.input_db, self.formula_metrics_df)
    self.fdrs = calculate_fdrs(self.pywren_executor, self.config['storage']['ds_bucket'],
                               self.rankings_df)

    logger.info('Number of annotations with FDR less than:')
    for fdr_step in [0.05, 0.1, 0.2, 0.5]:
        logger.info(f'{fdr_step*100:2.0f}%: {(self.fdrs.fdr < fdr_step).sum()}')

def load_ds(self, use_cache=True):
    cache_key = ':ds/load_ds.cache'
    if self.hybrid_impl:
        pass  # all work is done in segment_ds
    else:
        if use_cache and self.cacher.exists(cache_key):
            self.imzml_reader, self.imzml_reader_cobject = self.cacher.load(cache_key)
            logger.info(f'Loaded imzml from cache, {len(self.imzml_reader.coordinates)} spectra found')
        else:
            self.imzml_reader, self.imzml_reader_cobject = get_imzml_reader(self.lithops_executor,
                                                                            self.imzml_cobject)
            logger.info(f'Parsed imzml: {len(self.imzml_reader.coordinates)} spectra found')
            self.cacher.save((self.imzml_reader, self.imzml_reader_cobject), cache_key)

def segment_ds(self, use_cache=True, debug_validate=False):
    cache_key = ':ds/segment_ds.cache'
    if self.hybrid_impl:
        if use_cache and self.cacher.exists(cache_key):
            result = self.cacher.load(cache_key)
            logger.info(f'Loaded {len(result[2])} dataset segments from cache')
        else:
            sort_memory = 2 ** 32
            fs = self.lithops_vm_executor.call_async(
                load_and_split_ds_vm,
                (self.imzml_cobject, self.ibd_cobject, self.ds_segm_size_mb, sort_memory),
            )
            result = self.lithops_vm_executor.get_result(fs)

            logger.info(f'Segmented dataset chunks into {len(result[2])} segments')
            self.cacher.save(result, cache_key)

        self.imzml_reader, \
            self.ds_segments_bounds, \
            self.ds_segms_cobjects, \
            self.ds_segms_len, \
            ds_segm_stats = result
        for func_name, exec_time in ds_segm_stats:
            if func_name == 'upload_segments':
                cobjs_n = len(self.ds_segms_cobjects)
            else:
                cobjs_n = 0
            PipelineStats.append_vm(func_name, exec_time, cloud_objects_n=cobjs_n)
    else:
        if use_cache and self.cacher.exists(cache_key):
            self.ds_segments_bounds, self.ds_segms_cobjects, self.ds_segms_len = \
                self.cacher.load(cache_key)
            logger.info(f'Loaded {len(self.ds_segms_cobjects)} dataset segments from cache')
        else:
            sample_sp_n = 1000
            self.ds_segments_bounds = define_ds_segments(
                self.lithops_executor,
                self.ibd_cobject,
                self.imzml_reader_cobject,
                self.ds_segm_size_mb,
                sample_sp_n,
            )
            self.ds_segms_cobjects, self.ds_segms_len = segment_spectra(
                self.lithops_executor,
                self.ds_chunks_cobjects,
                self.ds_segments_bounds,
                self.ds_segm_size_mb,
                self.imzml_reader.mzPrecision,
            )
            logger.info(f'Segmented dataset chunks into {len(self.ds_segms_cobjects)} segments')
            self.cacher.save((self.ds_segments_bounds, self.ds_segms_cobjects, self.ds_segms_len),
                             cache_key)

    self.ds_segm_n = len(self.ds_segms_cobjects)
    self.is_intensive_dataset = self.ds_segm_n * self.ds_segm_size_mb > 5000

    if debug_validate:
        validate_ds_segments(
            self.lithops_executor,
            self.imzml_reader,
            self.ds_segments_bounds,
            self.ds_segms_cobjects,
            self.ds_segms_len,
            self.hybrid_impl,
        )

def split_ds(self, use_cache=True):
    cache_key = ':ds/split_ds.cache'
    if self.hybrid_impl:
        pass  # all work is done in segment_ds
    else:
        if use_cache and self.cacher.exists(cache_key):
            self.ds_chunks_cobjects = self.cacher.load(cache_key)
            logger.info(f'Loaded {len(self.ds_chunks_cobjects)} dataset chunks from cache')
        else:
            self.ds_chunks_cobjects = chunk_spectra(self.lithops_executor, self.ibd_cobject,
                                                    self.imzml_reader_cobject, self.imzml_reader)
            logger.info(f'Uploaded {len(self.ds_chunks_cobjects)} dataset chunks')
            self.cacher.save(self.ds_chunks_cobjects, cache_key)

def build_database(self, use_cache=True, debug_validate=False):
    if self.hybrid_impl:
        cache_key = ':ds/:db/build_database.cache'
        if use_cache and self.cacher.exists(cache_key):
            self.formula_cobjects, self.db_data_cobjects = self.cacher.load(cache_key)
            logger.info(f'Loaded {len(self.formula_cobjects)} formula segments and'
                        f' {len(self.db_data_cobjects)} db_data objects from cache')
        else:
            futures = self.lithops_vm_executor.call_async(
                build_database_local,
                (self.db_config, self.ds_config, self.mols_dbs_cobjects)
            )
            self.formula_cobjects, self.db_data_cobjects, build_db_exec_time = \
                self.lithops_vm_executor.get_result(futures)
            PipelineStats.append_vm('build_database', build_db_exec_time,
                                    cloud_objects_n=len(self.formula_cobjects))
            logger.info(f'Built {len(self.formula_cobjects)} formula segments and'
                        f' {len(self.db_data_cobjects)} db_data objects')
            self.cacher.save((self.formula_cobjects, self.db_data_cobjects), cache_key)
    else:
        cache_key = ':db/build_database.cache'
        if use_cache and self.cacher.exists(cache_key):
            self.formula_cobjects, self.formula_to_id_cobjects = self.cacher.load(cache_key)
            logger.info(f'Loaded {len(self.formula_cobjects)} formula segments and'
                        f' {len(self.formula_to_id_cobjects)} formula-to-id chunks from cache')
        else:
            self.formula_cobjects, self.formula_to_id_cobjects = build_database(
                self.lithops_executor, self.db_config, self.mols_dbs_cobjects
            )
            logger.info(f'Built {len(self.formula_cobjects)} formula segments and'
                        f' {len(self.formula_to_id_cobjects)} formula-to-id chunks')
            self.cacher.save((self.formula_cobjects, self.formula_to_id_cobjects), cache_key)

    if debug_validate:
        validate_formula_cobjects(self.storage, self.formula_cobjects)

def segment_ds(self):
    clean_from_cos(self.config, self.config["storage"]["ds_bucket"],
                   self.input_data["ds_segments"])

    sample_sp_n = 1000
    self.ds_segments_bounds = define_ds_segments(self.imzml_parser, self.ds_segm_size_mb,
                                                 sample_ratio=sample_sp_n / self.sp_n)
    self.ds_segm_n, self.ds_segms_len = segment_spectra(
        self.pywren_executor, self.config["storage"]["ds_bucket"], self.input_data["ds_chunks"],
        self.input_data["ds_segments"], self.ds_segments_bounds, self.ds_segm_size_mb,
        self.imzml_parser.mzPrecision)
    logger.info(f'Segmented dataset chunks into {self.ds_segm_n} segments')

def calculate_centroids(self, use_cache=True, debug_validate=False):
    cache_key = ':ds/:db/calculate_centroids.cache'
    if use_cache and self.cacher.exists(cache_key):
        self.peaks_cobjects = self.cacher.load(cache_key)
        logger.info(f'Loaded {len(self.peaks_cobjects)} centroid chunks from cache')
    else:
        self.peaks_cobjects = calculate_centroids(
            self.lithops_executor, self.formula_cobjects, self.ds_config
        )
        logger.info(f'Calculated {len(self.peaks_cobjects)} centroid chunks')
        self.cacher.save(self.peaks_cobjects, cache_key)

    if debug_validate:
        validate_peaks_cobjects(self.lithops_executor, self.peaks_cobjects)

def calculate_centroids(config, input_db, polarity='+', isocalc_sigma=0.001238):
    bucket = config["storage"]["db_bucket"]
    formulas_chunks_prefix = input_db["formulas_chunks"]
    centroids_chunks_prefix = input_db["centroids_chunks"]
    clean_from_cos(config, bucket, centroids_chunks_prefix)

    def calculate_peaks_for_formula(formula_i, formula):
        mzs, ints = isocalc_wrapper.centroids(formula)
        if mzs is not None:
            return list(zip(repeat(formula_i), range(len(mzs)), mzs, ints))
        else:
            return []

    def calculate_peaks_chunk(obj, id, storage):
        print(f'Calculating peaks from formulas chunk {obj.key}')
        chunk_df = pd.read_msgpack(obj.data_stream._raw_stream)
        peaks = [peak
                 for formula_i, formula in chunk_df.formula.items()
                 for peak in calculate_peaks_for_formula(formula_i, formula)]
        peaks_df = pd.DataFrame(peaks, columns=['formula_i', 'peak_i', 'mz', 'int'])
        peaks_df.set_index('formula_i', inplace=True)

        print(f'Storing centroids chunk {id}')
        centroids_chunk_key = f'{centroids_chunks_prefix}/{id}.msgpack'
        storage.put_object(Bucket=bucket, Key=centroids_chunk_key, Body=peaks_df.to_msgpack())

        return peaks_df.shape[0]

    # Import lazily so that the rest of the pipeline still works if the dependency is missing
    from annotation_pipeline.isocalc_wrapper import IsocalcWrapper
    isocalc_wrapper = IsocalcWrapper({
        # These instrument settings are usually customized on a per-dataset basis out of a set of
        # 18 possible combinations, but most of EMBL's datasets are compatible with the following settings:
        'charge': {
            'polarity': polarity,
            'n_charges': 1,
        },
        'isocalc_sigma': float(f"{isocalc_sigma:f}")  # Rounding to match production implementation
    })

    pw = pywren.ibm_cf_executor(config=config)
    memory_capacity_mb = 2048
    futures = pw.map(calculate_peaks_chunk, f'cos://{bucket}/{formulas_chunks_prefix}/',
                     runtime_memory=memory_capacity_mb)
    centroids_chunks_n = pw.get_result(futures)
    append_pywren_stats(futures, memory_mb=memory_capacity_mb, cloud_objects_n=len(futures))

    num_centroids = sum(centroids_chunks_n)
    n_centroids_chunks = len(centroids_chunks_n)
    logger.info(f'Calculated {num_centroids} centroids in {n_centroids_chunks} chunks')
    return num_centroids, n_centroids_chunks

def split_ds(self):
    ds_chunks_cache_key = f'{self.cacher.prefix}/split_ds.cache'
    if self.cacher.exists(ds_chunks_cache_key):
        self.ds_chunks_cobjects = self.cacher.load(ds_chunks_cache_key)
        logger.info(f'Loaded {len(self.ds_chunks_cobjects)} dataset chunks from cache')
    else:
        self.ds_chunks_cobjects = chunk_spectra(self.pywren_executor,
                                                self.ds_config['ibd_path'],
                                                self.imzml_cobject, self.imzml_reader)
        logger.info(f'Uploaded {len(self.ds_chunks_cobjects)} dataset chunks')
        self.cacher.save(self.ds_chunks_cobjects, ds_chunks_cache_key)

def __init__(self, ds_config, db_config, use_db_cache=True, use_ds_cache=True, hybrid_impl='auto'):
    self.config = default_config()
    self.ds_config = ds_config
    self.db_config = db_config
    self.use_db_cache = use_db_cache
    self.use_ds_cache = use_ds_cache
    if hybrid_impl == 'auto':
        self.hybrid_impl = (
            self.config['lithops']['mode'] == 'localhost'
            or (self.config['lithops']['mode'] == 'serverless' and 'ibm_vpc' in self.config)
        )
        if self.hybrid_impl:
            logger.info('Using the Hybrid implementation')
        else:
            logger.info('Using the pure Serverless implementation')
    else:
        self.hybrid_impl = hybrid_impl

    lithops_bucket = self.config['lithops']['storage_bucket']
    self.ds_bucket = self.config.get('storage', {}).get('ds_bucket', lithops_bucket)

    self.lithops_executor = lithops.FunctionExecutor(config=self.config, runtime_memory=2048)
    if self.hybrid_impl:
        if self.config['lithops']['mode'] == 'localhost':
            self.lithops_vm_executor = self.lithops_executor
        else:
            self.lithops_vm_executor = lithops.StandaloneExecutor(config=self.config)

    self.storage = Storage(config=self.config)

    # Use the resolved flag - the hybrid_impl argument may still be the string 'auto' here
    cache_namespace = 'vm' if self.hybrid_impl else 'function'
    self.cacher = PipelineCacher(
        self.storage, lithops_bucket, cache_namespace,
        self.ds_config["name"], self.db_config["name"]
    )
    if not self.use_db_cache or not self.use_ds_cache:
        self.cacher.clean(database=not self.use_db_cache, dataset=not self.use_ds_cache)

    stats_path_cache_key = ':ds/:db/stats_path.cache'
    if self.cacher.exists(stats_path_cache_key):
        self.stats_path = self.cacher.load(stats_path_cache_key)
        PipelineStats.path = self.stats_path
        logger.info(f'Using cached {self.stats_path} for statistics')
    else:
        PipelineStats.init()
        self.stats_path = PipelineStats.path
        self.cacher.save(self.stats_path, stats_path_cache_key)
        logger.info(f'Initialised {self.stats_path} for statistics')

    self.ds_segm_size_mb = 128
    self.image_gen_config = {
        "q": 99,
        "do_preprocessing": False,
        "nlevels": 30,
        "ppm": 3.0
    }

def load_ds(self):
    imzml_cache_key = f'{self.cacher.prefix}/load_ds.cache'
    if self.cacher.exists(imzml_cache_key):
        self.imzml_reader, self.imzml_cobject = self.cacher.load(imzml_cache_key)
        logger.info(f'Loaded imzml from cache, {len(self.imzml_reader.coordinates)} spectra found')
    else:
        self.imzml_reader, self.imzml_cobject = get_imzml_reader(self.pywren_executor,
                                                                 self.ds_config['imzml_path'])
        logger.info(f'Parsed imzml: {len(self.imzml_reader.coordinates)} spectra found')
        self.cacher.save((self.imzml_reader, self.imzml_cobject), imzml_cache_key)

def save_results(self, out_dir='.'):
    out_dir = Path(out_dir)
    images_dir = out_dir / 'images'
    images_dir.mkdir(parents=True, exist_ok=True)

    results_df = self.get_results()
    results_df.to_csv(out_dir / 'results.csv')

    image_sets = self.get_images(True, True)
    # Keep a reference in the __main__ namespace so the images stay accessible interactively
    __import__('__main__').image_sets = image_sets
    filenames = (results_df.full_mol + '.png').to_dict()
    n_saved_images = 0
    for formula_i, image_set in image_sets.items():
        if image_set[0] is not None and formula_i in filenames:
            (images_dir / filenames[formula_i]).write_bytes(image_set[0])
            n_saved_images += 1

    logger.info(f'Saved results.csv and {n_saved_images} images to {out_dir.resolve()}')

def define_ds_segments(imzml_parser, ds_segm_size_mb=5, sample_ratio=0.05):
    logger.info('Defining dataset segments bounds')

    spectra_sample = list(spectra_sample_gen(imzml_parser, sample_ratio=sample_ratio))

    spectra_mzs = np.array([mz for sp_id, mzs, ints in spectra_sample for mz in mzs])
    total_n_mz = spectra_mzs.shape[0] / sample_ratio

    float_prec = 4 if imzml_parser.mzPrecision == 'f' else 8
    segm_arr_columns = 3
    segm_n = segm_arr_columns * (total_n_mz * float_prec) // (ds_segm_size_mb * 2 ** 20)
    segm_n = max(1, int(segm_n))

    segm_bounds_q = [i * 1 / segm_n for i in range(0, segm_n + 1)]
    segm_lower_bounds = [np.quantile(spectra_mzs, q) for q in segm_bounds_q]
    ds_segments = np.array(list(zip(segm_lower_bounds[:-1], segm_lower_bounds[1:])))

    logger.info(f'Generated {len(ds_segments)} dataset bounds: {ds_segments[0]}...{ds_segments[-1]}')
    return ds_segments

def clean(self, database=True, dataset=True, hard=False):
    unique_prefixes = []
    if not hard:
        if database:
            unique_prefixes.append(self.prefixes[':db'])
        if dataset:
            unique_prefixes.append(self.prefixes[':ds'])
        if database or dataset:
            unique_prefixes.append(self.prefixes[':ds/:db'])
    else:
        unique_prefixes.append(self.prefixes[''])

    keys = [key
            for prefix in unique_prefixes
            for key in self.storage.list_keys(self.bucket, prefix)]

    cobjects_to_clean = []
    for cache_key in keys:
        cache_data = read_object_with_retry(self.storage, self.bucket, cache_key, deserialise)

        if isinstance(cache_data, tuple):
            for obj in cache_data:
                if isinstance(obj, list):
                    if isinstance(obj[0], CloudObject):
                        cobjects_to_clean.extend(obj)
                elif isinstance(obj, CloudObject):
                    cobjects_to_clean.append(obj)
        elif isinstance(cache_data, list):
            if isinstance(cache_data[0], CloudObject):
                cobjects_to_clean.extend(cache_data)
        elif isinstance(cache_data, CloudObject):
            cobjects_to_clean.append(cache_data)

    self.storage.delete_cloudobjects(cobjects_to_clean)
    for prefix in unique_prefixes:
        keys = self.storage.list_keys(self.bucket, prefix)
        if keys:
            self.storage.delete_objects(self.bucket, keys)
        logger.info(f'Removed {len(keys)} objects from {self.storage.backend}://{self.bucket}/{prefix}')

def run_fdr(self, use_cache=True):
    cache_key = ':ds/:db/run_fdr.cache'
    if use_cache and self.cacher.exists(cache_key):
        self.fdrs = self.cacher.load(cache_key)
        logger.info('Loaded fdrs from cache')
    else:
        if self.hybrid_impl:
            futures = self.lithops_vm_executor.call_async(
                calculate_fdrs_vm,
                (self.formula_metrics_df, self.db_data_cobjects),
            )
            self.fdrs, fdr_exec_time = self.lithops_vm_executor.get_result(futures)
            PipelineStats.append_vm('calculate_fdrs', fdr_exec_time)
        else:
            rankings_df = build_fdr_rankings(
                self.lithops_executor, self.ds_config, self.db_config,
                self.mols_dbs_cobjects, self.formula_to_id_cobjects, self.formula_metrics_df
            )
            self.fdrs = calculate_fdrs(self.lithops_executor, rankings_df)
        self.cacher.save(self.fdrs, cache_key)

    logger.info('Number of annotations with FDR less than:')
    for fdr_step in [0.05, 0.1, 0.2, 0.5]:
        logger.info(f'{fdr_step*100:2.0f}%: {(self.fdrs.fdr < fdr_step).sum()}')