def merge_centr_df_segments(segm_i, ibm_cos):
    print(f'Merging segment {segm_i} clipped centroids chunks')
    keys = list_keys(bucket, f'{centr_segm_prefix}/chunk/{segm_i}/', ibm_cos)

    def _merge(key):
        segm_centr_df_chunk = read_object_with_retry(ibm_cos, bucket, key, pd.read_msgpack)
        return segm_centr_df_chunk

    with ThreadPoolExecutor(max_workers=128) as pool:
        segm = pd.concat(list(pool.map(_merge, keys)))
        del segm['segm_i']

    clean_from_cos(None, bucket, f'{centr_segm_prefix}/chunk/{segm_i}/', ibm_cos)
    centr_segm_df = segment_centr_df(segm, centr_segm_lower_bounds[segm_i])

    def _second_level_upload(args):
        segm_j, df = args
        base_id = sum([len(bounds) for bounds in centr_segm_lower_bounds[:segm_i]])
        id = base_id + segm_j
        print(f'Storing centroids segment {id}')
        ibm_cos.put_object(Bucket=bucket,
                           Key=f'{centr_segm_prefix}/{id}.msgpack',
                           Body=df.to_msgpack())

    # Upload each second-level sub-segment; segm_j is the sub-segment index within this segment
    with ThreadPoolExecutor(max_workers=128) as pool:
        pool.map(_second_level_upload,
                 [(segm_j, df) for segm_j, df in centr_segm_df.groupby('segm_i')])
def merge_spectra_chunk_segments(segm_i, ibm_cos):
    print(f'Merging segment {segm_i} spectra chunks')
    keys = list_keys(bucket, f'{ds_segments_prefix}/chunk/{segm_i}/', ibm_cos)

    def _merge(key):
        segm_spectra_chunk = read_object_with_retry(ibm_cos, bucket, key, msgpack.load)
        return segm_spectra_chunk

    with ThreadPoolExecutor(max_workers=128) as pool:
        segm = list(pool.map(_merge, keys))

    segm = np.concatenate(segm)

    # Alternative in-place sorting (slower):
    # segm.view(f'{segm_dtype},{segm_dtype},{segm_dtype}').sort(order=['f1'], axis=0)
    segm = segm[segm[:, 1].argsort()]

    clean_from_cos(None, bucket, f'{ds_segments_prefix}/chunk/{segm_i}/', ibm_cos)

    bounds_list = ds_segments_bounds[segm_i]

    segms_len = []
    for segm_j in range(len(bounds_list)):
        l, r = bounds_list[segm_j]
        segm_start, segm_end = np.searchsorted(segm[:, 1], (l, r))  # mz expected to be in column 1
        sub_segm = segm[segm_start:segm_end]
        segms_len.append(len(sub_segm))

        base_id = sum([len(bounds) for bounds in ds_segments_bounds[:segm_i]])
        id = base_id + segm_j
        print(f'Storing dataset segment {id}')
        ibm_cos.put_object(Bucket=bucket,
                           Key=f'{ds_segments_prefix}/{id}.msgpack',
                           Body=msgpack.dumps(sub_segm))

    return segms_len
def annotate(self):
    logger.info('Annotating...')
    clean_from_cos(self.config, self.config["storage"]["output_bucket"], self.output["formula_images"])

    memory_capacity_mb = 2048  # TODO: Detect when this isn't enough and bump it up to 4096
    process_centr_segment = create_process_segment(self.config["storage"]["ds_bucket"],
                                                   self.config["storage"]["output_bucket"],
                                                   self.input_data["ds_segments"],
                                                   self.ds_segments_bounds, self.ds_segms_len,
                                                   self.coordinates, self.image_gen_config,
                                                   memory_capacity_mb, self.ds_segm_size_mb,
                                                   self.imzml_parser.mzPrecision)

    futures = self.pywren_executor.map(
        process_centr_segment,
        f'{self.config["storage"]["db_bucket"]}/{self.input_db["centroids_segments"]}/',
        runtime_memory=memory_capacity_mb)
    formula_metrics_list, images_cloud_objs = zip(*self.pywren_executor.get_result(futures))
    self.formula_metrics_df = pd.concat(formula_metrics_list)
    self.images_cloud_objs = list(chain(*images_cloud_objs))
    append_pywren_stats(futures, memory=memory_capacity_mb, plus_objects=len(self.images_cloud_objs))

    logger.info(f'Metrics calculated: {self.formula_metrics_df.shape[0]}')
def deduplicate_formulas_segment(segm_i, storage, clean=True):
    print(f'Deduplicating formulas segment {segm_i}')
    keys = list_keys(bucket, f'{formulas_chunks_prefix}/chunk/{segm_i}/', storage)

    segm = set()
    for key in keys:
        segm_formulas_chunk = pickle.loads(read_object_with_retry(storage, bucket, key))
        segm.update(segm_formulas_chunk)

    if clean:
        clean_from_cos(config, bucket, f'{formulas_chunks_prefix}/chunk/{segm_i}/', storage)

    return segm
def segment_ds(self):
    clean_from_cos(self.config, self.config["storage"]["ds_bucket"], self.input_data["ds_segments"])

    sample_sp_n = 1000
    self.ds_segments_bounds = define_ds_segments(self.imzml_parser, self.ds_segm_size_mb,
                                                 sample_ratio=sample_sp_n / self.sp_n)
    self.ds_segm_n, self.ds_segms_len = segment_spectra(self.pywren_executor,
                                                        self.config["storage"]["ds_bucket"],
                                                        self.input_data["ds_chunks"],
                                                        self.input_data["ds_segments"],
                                                        self.ds_segments_bounds,
                                                        self.ds_segm_size_mb,
                                                        self.imzml_parser.mzPrecision)
    logger.info(f'Segmented dataset chunks into {self.ds_segm_n} segments')
def calculate_centroids(config, input_db, polarity='+', isocalc_sigma=0.001238):
    bucket = config["storage"]["db_bucket"]
    formulas_chunks_prefix = input_db["formulas_chunks"]
    centroids_chunks_prefix = input_db["centroids_chunks"]
    clean_from_cos(config, bucket, centroids_chunks_prefix)

    def calculate_peaks_for_formula(formula_i, formula):
        mzs, ints = isocalc_wrapper.centroids(formula)
        if mzs is not None:
            return list(zip(repeat(formula_i), range(len(mzs)), mzs, ints))
        else:
            return []

    def calculate_peaks_chunk(obj, id, storage):
        print(f'Calculating peaks from formulas chunk {obj.key}')
        chunk_df = pd.read_msgpack(obj.data_stream._raw_stream)
        peaks = [peak for formula_i, formula in chunk_df.formula.items()
                 for peak in calculate_peaks_for_formula(formula_i, formula)]
        peaks_df = pd.DataFrame(peaks, columns=['formula_i', 'peak_i', 'mz', 'int'])
        peaks_df.set_index('formula_i', inplace=True)

        print(f'Storing centroids chunk {id}')
        centroids_chunk_key = f'{centroids_chunks_prefix}/{id}.msgpack'
        storage.put_object(Bucket=bucket, Key=centroids_chunk_key, Body=peaks_df.to_msgpack())

        return peaks_df.shape[0]

    # Import lazily so that the rest of the pipeline still works if the dependency is missing
    from annotation_pipeline.isocalc_wrapper import IsocalcWrapper
    isocalc_wrapper = IsocalcWrapper({
        # These instrument settings are usually customized on a per-dataset basis out of a set of
        # 18 possible combinations, but most of EMBL's datasets are compatible with the following settings:
        'charge': {
            'polarity': polarity,
            'n_charges': 1,
        },
        'isocalc_sigma': float(f"{isocalc_sigma:f}")  # Rounding to match production implementation
    })

    pw = pywren.ibm_cf_executor(config=config)
    memory_capacity_mb = 2048
    futures = pw.map(calculate_peaks_chunk, f'cos://{bucket}/{formulas_chunks_prefix}/',
                     runtime_memory=memory_capacity_mb)
    centroids_chunks_n = pw.get_result(futures)
    append_pywren_stats(futures, memory_mb=memory_capacity_mb, cloud_objects_n=len(futures))

    num_centroids = sum(centroids_chunks_n)
    n_centroids_chunks = len(centroids_chunks_n)
    logger.info(f'Calculated {num_centroids} centroids in {n_centroids_chunks} chunks')
    return num_centroids, n_centroids_chunks
def clean(self):
    keys = list_keys(self.bucket, self.prefix, self.storage_handler)

    # Collect every CloudObject referenced from cached results so PyWren can delete them
    cobjects_to_clean = []
    for cache_key in keys:
        cache_data = self.load(cache_key)

        if isinstance(cache_data, tuple):
            for obj in cache_data:
                if isinstance(obj, list):
                    if isinstance(obj[0], CloudObject):
                        cobjects_to_clean.extend(obj)
                elif isinstance(obj, CloudObject):
                    cobjects_to_clean.append(obj)
        elif isinstance(cache_data, list):
            if isinstance(cache_data[0], CloudObject):
                cobjects_to_clean.extend(cache_data)
        elif isinstance(cache_data, CloudObject):
            cobjects_to_clean.append(cache_data)

    self.pywren_executor.clean(cs=cobjects_to_clean)
    clean_from_cos(self.config, self.bucket, self.prefix, self.storage_handler)
def build_database_local(storage, config, input_db):
    bucket = config["storage"]["db_bucket"]
    formulas_chunks_prefix = input_db["formulas_chunks"]
    formula_to_id_chunks_prefix = input_db["formula_to_id_chunks"]
    clean_from_cos(config, bucket, formulas_chunks_prefix)
    clean_from_cos(config, bucket, formula_to_id_chunks_prefix)

    formulas_df = get_formulas_df(storage, bucket, input_db)
    num_formulas = len(formulas_df)
    logger.info(f'Generated {num_formulas} formulas')

    n_formulas_chunks = store_formula_segments(storage, bucket, formulas_chunks_prefix, formulas_df)
    logger.info(f'Stored {num_formulas} formulas in {n_formulas_chunks} chunks')

    n_formula_to_id = store_formula_to_id(storage, bucket, formula_to_id_chunks_prefix, formulas_df)
    logger.info(f'Built {n_formula_to_id} formula_to_id dictionary chunks')

    return num_formulas, n_formulas_chunks
def segment_centroids(self):
    mz_min, mz_max = self.ds_segments_bounds[0, 0], self.ds_segments_bounds[-1, 1]

    clean_from_cos(self.config, self.config["storage"]["db_bucket"], self.input_db["clipped_centroids_chunks"])
    self.centr_n = clip_centr_df(self.pywren_executor, self.config["storage"]["db_bucket"],
                                 self.input_db["centroids_chunks"],
                                 self.input_db["clipped_centroids_chunks"],
                                 mz_min, mz_max)

    clean_from_cos(self.config, self.config["storage"]["db_bucket"], self.input_db["centroids_segments"])
    self.centr_segm_lower_bounds = define_centr_segments(self.pywren_executor,
                                                         self.config["storage"]["db_bucket"],
                                                         self.input_db["clipped_centroids_chunks"],
                                                         self.centr_n, self.ds_segm_n, self.ds_segm_size_mb)
    self.centr_segm_n = segment_centroids(self.pywren_executor, self.config["storage"]["db_bucket"],
                                          self.input_db["clipped_centroids_chunks"],
                                          self.input_db["centroids_segments"],
                                          self.centr_segm_lower_bounds)
    logger.info(f'Segmented centroids chunks into {self.centr_segm_n} segments')
def build_database(config, input_db):
    bucket = config["storage"]["db_bucket"]
    formulas_chunks_prefix = input_db["formulas_chunks"]
    clean_from_cos(config, bucket, formulas_chunks_prefix)

    adducts = [*input_db['adducts'], *DECOY_ADDUCTS]
    modifiers = input_db['modifiers']
    databases = input_db['databases']

    N_HASH_SEGMENTS = 32  # should be less than N_FORMULAS_SEGMENTS

    def hash_formula_to_segment(formula):
        m = hashlib.md5()
        m.update(formula.encode('utf-8'))
        return int(m.hexdigest(), 16) % N_HASH_SEGMENTS

    def generate_formulas(adduct, ibm_cos):
        print(f'Generating formulas for adduct {adduct}')

        def _get_mols(mols_key):
            return pickle.loads(read_object_with_retry(ibm_cos, bucket, mols_key))

        with ThreadPoolExecutor(max_workers=128) as pool:
            mols_list = list(pool.map(_get_mols, databases))

        formulas = set()
        for mols in mols_list:
            for modifier in modifiers:
                formulas.update(map(safe_generate_ion_formula, mols, repeat(modifier), repeat(adduct)))

        if None in formulas:
            formulas.remove(None)

        formulas_segments = {}
        for formula in formulas:
            segm_i = hash_formula_to_segment(formula)
            if segm_i in formulas_segments:
                formulas_segments[segm_i].append(formula)
            else:
                formulas_segments[segm_i] = [formula]

        def _store(segm_i):
            ibm_cos.put_object(Bucket=bucket,
                               Key=f'{formulas_chunks_prefix}/chunk/{segm_i}/{adduct}.pickle',
                               Body=pickle.dumps(formulas_segments[segm_i]))

        segments_n = [segm_i for segm_i in formulas_segments]
        with ThreadPoolExecutor(max_workers=128) as pool:
            pool.map(_store, segments_n)

        return segments_n

    pw = pywren.ibm_cf_executor(config=config)
    memory_capacity_mb = 512
    futures = pw.map(generate_formulas, adducts, runtime_memory=memory_capacity_mb)
    segments_n = list(set().union(*pw.get_result(futures)))
    append_pywren_stats(futures, memory=memory_capacity_mb, plus_objects=len(adducts) * len(segments_n))

    def deduplicate_formulas_segment(segm_i, ibm_cos, clean=True):
        print(f'Deduplicating formulas segment {segm_i}')
        keys = list_keys(bucket, f'{formulas_chunks_prefix}/chunk/{segm_i}/', ibm_cos)

        segm = set()
        for key in keys:
            segm_formulas_chunk = pickle.loads(read_object_with_retry(ibm_cos, bucket, key))
            segm.update(segm_formulas_chunk)

        if clean:
            clean_from_cos(config, bucket, f'{formulas_chunks_prefix}/chunk/{segm_i}/', ibm_cos)

        return segm

    def get_formulas_number_per_chunk(segm_i, ibm_cos):
        segm = deduplicate_formulas_segment(segm_i, ibm_cos, clean=False)
        return len(segm)

    pw = pywren.ibm_cf_executor(config=config)
    memory_capacity_mb = 512
    futures = pw.map(get_formulas_number_per_chunk, segments_n, runtime_memory=memory_capacity_mb)
    formulas_nums = pw.get_result(futures)
    append_pywren_stats(futures, memory=memory_capacity_mb)

    def store_formulas_segment(segm_i, ibm_cos):
        segm = deduplicate_formulas_segment(segm_i, ibm_cos)
        formula_i_start = sum(formulas_nums[:segm_i])
        formula_i_end = formula_i_start + len(segm)
        segm = pd.DataFrame(sorted(segm), columns=['formula'],
                            index=pd.RangeIndex(formula_i_start, formula_i_end, name='formula_i'))

        ibm_cos.put_object(Bucket=bucket,
                           Key=f'{formulas_chunks_prefix}_fdr/{segm_i}.msgpack',
                           Body=segm.to_msgpack())

        n_threads = N_FORMULAS_SEGMENTS // N_HASH_SEGMENTS
        subsegm_size = math.ceil(len(segm) / n_threads)
        segm_list = [segm[i:i + subsegm_size] for i in range(0, segm.shape[0], subsegm_size)]

        def _store(segm_j):
            id = segm_i * n_threads + segm_j
            print(f'Storing formulas segment {id}')
            ibm_cos.put_object(Bucket=bucket,
                               Key=f'{formulas_chunks_prefix}/{id}.msgpack',
                               Body=segm_list[segm_j].to_msgpack())

        with ThreadPoolExecutor(max_workers=128) as pool:
            pool.map(_store, range(n_threads))

        return [len(segm) for segm in segm_list]

    pw = pywren.ibm_cf_executor(config=config)
    memory_capacity_mb = 512
    futures = pw.map(store_formulas_segment, segments_n, runtime_memory=memory_capacity_mb)
    results = pw.get_result(futures)
    append_pywren_stats(futures, memory=memory_capacity_mb,
                        plus_objects=N_FORMULAS_SEGMENTS, minus_objects=len(adducts) * len(segments_n))

    num_formulas = sum(formulas_nums)
    n_formulas_chunks = sum([len(result) for result in results])
    logger.info(f'Generated {num_formulas} formulas in {n_formulas_chunks} chunks')

    formula_to_id_chunks_prefix = input_db["formula_to_id_chunks"]
    clean_from_cos(config, bucket, formula_to_id_chunks_prefix)
    formulas_bytes = 200 * num_formulas
    formula_to_id_chunk_mb = 512
    N_FORMULA_TO_ID = int(math.ceil(formulas_bytes / (formula_to_id_chunk_mb * 1024 ** 2)))

    def store_formula_to_id_chunk(ch_i, ibm_cos):
        print(f'Storing formula_to_id dictionary chunk {ch_i}')
        start_id = (N_FORMULAS_SEGMENTS // N_FORMULA_TO_ID) * ch_i
        end_id = (N_FORMULAS_SEGMENTS // N_FORMULA_TO_ID) * (ch_i + 1)
        keys = [f'{formulas_chunks_prefix}/{formulas_chunk}.msgpack'
                for formulas_chunk in range(start_id, end_id)]

        def _get(key):
            formula_chunk = read_object_with_retry(ibm_cos, bucket, key, pd.read_msgpack)
            formula_to_id_chunk = dict(zip(formula_chunk.formula, formula_chunk.index))
            return formula_to_id_chunk

        with ThreadPoolExecutor(max_workers=128) as pool:
            results = list(pool.map(_get, keys))

        formula_to_id = {}
        for chunk_dict in results:
            formula_to_id.update(chunk_dict)

        ibm_cos.put_object(Bucket=bucket,
                           Key=f'{formula_to_id_chunks_prefix}/{ch_i}.msgpack',
                           Body=msgpack.dumps(formula_to_id))

    pw = pywren.ibm_cf_executor(config=config)
    safe_mb = 512
    memory_capacity_mb = formula_to_id_chunk_mb * 2 + safe_mb
    futures = pw.map(store_formula_to_id_chunk, range(N_FORMULA_TO_ID), runtime_memory=memory_capacity_mb)
    pw.get_result(futures)
    append_pywren_stats(futures, memory=memory_capacity_mb, plus_objects=N_FORMULA_TO_ID)

    logger.info(f'Built {N_FORMULA_TO_ID} formula_to_id dictionary chunks')

    return num_formulas, n_formulas_chunks
def split_ds(self):
    clean_from_cos(self.config, self.config["storage"]["ds_bucket"], self.input_data["ds_chunks"])
    self.specra_chunks_keys = chunk_spectra(self.config, self.input_data, self.imzml_parser, self.coordinates)