def merge_centr_df_segments(segm_i, ibm_cos):
        """Merge the clipped-centroid chunks of one segment and store its sub-segments.

        All objects under ``{centr_segm_prefix}/chunk/{segm_i}/`` are downloaded in
        parallel, concatenated into one DataFrame, and removed from storage. The
        merged frame is re-segmented with ``segment_centr_df`` and every resulting
        group is uploaded as ``{centr_segm_prefix}/{id}.msgpack``.

        ``bucket``, ``centr_segm_prefix`` and ``centr_segm_lower_bounds`` are taken
        from the enclosing scope, which is not visible in this block.

        Args:
            segm_i: Index of the first-level segment; used in the chunk prefix and
                to index ``centr_segm_lower_bounds``.
            ibm_cos: Storage client; must provide ``put_object`` and is passed to
                the storage helpers.

        Raises:
            Any exception raised while downloading a chunk or uploading a
            sub-segment. Upload errors used to be dropped because the iterator
            returned by ``pool.map`` was never consumed.
        """
        print(f'Merging segment {segm_i} clipped centroids chunks')

        chunk_prefix = f'{centr_segm_prefix}/chunk/{segm_i}/'
        keys = list_keys(bucket, chunk_prefix, ibm_cos)

        def _merge(key):
            return read_object_with_retry(ibm_cos, bucket, key, pd.read_msgpack)

        with ThreadPoolExecutor(max_workers=128) as pool:
            segm = pd.concat(list(pool.map(_merge, keys)))
        # The first-level segment label is dropped before re-segmenting.
        del segm['segm_i']

        clean_from_cos(None, bucket, chunk_prefix, ibm_cos)
        centr_segm_df = segment_centr_df(segm, centr_segm_lower_bounds[segm_i])

        # Global id of the first sub-segment: the number of sub-segments in all
        # preceding first-level segments. It is the same for every upload.
        base_id = sum(len(bounds) for bounds in centr_segm_lower_bounds[:segm_i])

        def _second_level_upload(args):
            segm_j, df = args
            segment_id = base_id + segm_j
            print(f'Storing centroids segment {segment_id}')
            ibm_cos.put_object(Bucket=bucket,
                               Key=f'{centr_segm_prefix}/{segment_id}.msgpack',
                               Body=df.to_msgpack())

        with ThreadPoolExecutor(max_workers=128) as pool:
            # Consume the iterator so that a failed upload re-raises here.
            list(pool.map(_second_level_upload, list(centr_segm_df.groupby('segm_i'))))
    def merge_spectra_chunk_segments(segm_i, ibm_cos):
        """Merge the spectra chunks of one segment, sort them and store the sub-segments.

        Objects under ``{ds_segments_prefix}/chunk/{segm_i}/`` are downloaded in
        parallel, concatenated, sorted by column 1 and removed from storage. The
        sorted array is then cut at each ``(left, right)`` pair of
        ``ds_segments_bounds[segm_i]`` and each slice is uploaded as
        ``{ds_segments_prefix}/{id}.msgpack``.

        ``bucket``, ``ds_segments_prefix`` and ``ds_segments_bounds`` come from the
        enclosing scope, which is not visible in this block.

        Returns:
            list: Number of rows stored in each sub-segment, in bounds order.
        """
        print(f'Merging segment {segm_i} spectra chunks')

        chunk_prefix = f'{ds_segments_prefix}/chunk/{segm_i}/'
        keys = list_keys(bucket, chunk_prefix, ibm_cos)

        def _fetch(key):
            return read_object_with_retry(ibm_cos, bucket, key, msgpack.load)

        with ThreadPoolExecutor(max_workers=128) as pool:
            chunks = list(pool.map(_fetch, keys))

        merged = np.concatenate(chunks)

        # Sort rows by column 1. An in-place sort through a structured view
        # (``.view(...).sort(order=['f1'], axis=0)``) is possible but slower.
        order = merged[:, 1].argsort()
        merged = merged[order]

        clean_from_cos(None, bucket, chunk_prefix, ibm_cos)
        bounds_list = ds_segments_bounds[segm_i]

        # Global id of this segment's first sub-segment.
        base_id = sum(len(bounds) for bounds in ds_segments_bounds[:segm_i])

        segms_len = []
        for segm_j, (left, right) in enumerate(bounds_list):
            # mz expected to be in column 1
            start, stop = np.searchsorted(merged[:, 1], (left, right))
            piece = merged[start:stop]
            segms_len.append(len(piece))
            segment_id = base_id + segm_j
            print(f'Storing dataset segment {segment_id}')
            ibm_cos.put_object(Bucket=bucket,
                               Key=f'{ds_segments_prefix}/{segment_id}.msgpack',
                               Body=msgpack.dumps(piece))

        return segms_len
Example #3
0
    def annotate(self):
        """Run the annotation step and collect metrics and image cloud objects.

        Clears the ``formula_images`` output prefix, builds the per-segment worker
        with ``create_process_segment`` and maps it over the
        ``<db_bucket>/<centroids_segments>/`` location through the pywren executor.
        Each result is unpacked as a ``(formula_metrics, image_cloud_objects)``
        pair. The metrics are concatenated into ``self.formula_metrics_df`` and the
        cloud objects are flattened into ``self.images_cloud_objs``.
        """
        logger.info('Annotating...')
        storage_cfg = self.config["storage"]
        clean_from_cos(self.config, storage_cfg["output_bucket"],
                       self.output["formula_images"])

        memory_capacity_mb = 2048  # TODO: Detect when this isn't enough and bump it up to 4096
        process_centr_segment = create_process_segment(
            storage_cfg["ds_bucket"],
            storage_cfg["output_bucket"],
            self.input_data["ds_segments"],
            self.ds_segments_bounds,
            self.ds_segms_len,
            self.coordinates,
            self.image_gen_config,
            memory_capacity_mb,
            self.ds_segm_size_mb,
            self.imzml_parser.mzPrecision)

        centr_segments_location = f'{storage_cfg["db_bucket"]}/{self.input_db["centroids_segments"]}/'
        futures = self.pywren_executor.map(process_centr_segment,
                                           centr_segments_location,
                                           runtime_memory=memory_capacity_mb)
        results = self.pywren_executor.get_result(futures)

        # Transpose the list of pairs into two parallel tuples.
        formula_metrics_list, images_cloud_objs = zip(*results)
        self.formula_metrics_df = pd.concat(formula_metrics_list)
        self.images_cloud_objs = list(chain.from_iterable(images_cloud_objs))

        append_pywren_stats(futures,
                            memory=memory_capacity_mb,
                            plus_objects=len(self.images_cloud_objs))

        metrics_n = self.formula_metrics_df.shape[0]
        logger.info(f'Metrics calculated: {metrics_n}')
    def deduplicate_formulas_segment(segm_i, storage, clean=True):
        """Return the set of unique formulas stored in the chunks of one segment.

        Every object under ``{formulas_chunks_prefix}/chunk/{segm_i}/`` is
        unpickled and merged into a single set. ``config``, ``bucket`` and
        ``formulas_chunks_prefix`` come from the enclosing scope, which is not
        visible in this block.

        Args:
            segm_i: Segment index used in the chunk prefix.
            storage: Storage client passed through to the storage helpers.
            clean: When true, the chunk objects are removed after being read.

        Returns:
            set: Union of the contents of all chunks.
        """
        print(f'Deduplicating formulas segment {segm_i}')
        chunk_prefix = f'{formulas_chunks_prefix}/chunk/{segm_i}/'

        unique_formulas = set()
        for chunk_key in list_keys(bucket, chunk_prefix, storage):
            raw_chunk = read_object_with_retry(storage, bucket, chunk_key)
            unique_formulas.update(pickle.loads(raw_chunk))

        if clean:
            clean_from_cos(config, bucket, chunk_prefix, storage)

        return unique_formulas
Example #5
0
 def segment_ds(self):
     """Define the dataset segment bounds and split the dataset chunks into segments.

     Clears the ``ds_segments`` prefix, derives the bounds with
     ``define_ds_segments`` using a sample ratio of ``1000 / self.sp_n``, then
     runs ``segment_spectra``. The results are stored in
     ``self.ds_segments_bounds``, ``self.ds_segm_n`` and ``self.ds_segms_len``.
     """
     ds_bucket = self.config["storage"]["ds_bucket"]
     segments_prefix = self.input_data["ds_segments"]
     clean_from_cos(self.config, ds_bucket, segments_prefix)

     sample_sp_n = 1000
     sample_ratio = sample_sp_n / self.sp_n
     self.ds_segments_bounds = define_ds_segments(self.imzml_parser,
                                                  self.ds_segm_size_mb,
                                                  sample_ratio=sample_ratio)

     segm_n, segms_len = segment_spectra(
         self.pywren_executor,
         ds_bucket,
         self.input_data["ds_chunks"],
         segments_prefix,
         self.ds_segments_bounds,
         self.ds_segm_size_mb,
         self.imzml_parser.mzPrecision)
     self.ds_segm_n = segm_n
     self.ds_segms_len = segms_len
     logger.info(f'Segmented dataset chunks into {self.ds_segm_n} segments')
def calculate_centroids(config, input_db, polarity='+', isocalc_sigma=0.001238):
    """Compute isotopic-peak centroids for every stored formulas chunk.

    Clears the ``centroids_chunks`` prefix, then maps ``calculate_peaks_chunk``
    over the objects under ``cos://<db_bucket>/<formulas_chunks>/`` with pywren.
    Each invocation writes one ``<centroids_chunks>/<id>.msgpack`` object.

    Args:
        config: Configuration mapping; ``config["storage"]["db_bucket"]`` is read
            and the whole mapping is handed to pywren and ``clean_from_cos``.
        input_db: Mapping providing the ``formulas_chunks`` and
            ``centroids_chunks`` prefixes.
        polarity: Charge polarity passed to ``IsocalcWrapper``.
        isocalc_sigma: Sigma passed to ``IsocalcWrapper`` after rounding through
            ``:f`` formatting.

    Returns:
        tuple: ``(num_centroids, n_centroids_chunks)``.
    """
    bucket = config["storage"]["db_bucket"]
    formulas_chunks_prefix = input_db["formulas_chunks"]
    centroids_chunks_prefix = input_db["centroids_chunks"]
    clean_from_cos(config, bucket, centroids_chunks_prefix)

    # Import lazily so that the rest of the pipeline still works if the dependency is missing
    from annotation_pipeline.isocalc_wrapper import IsocalcWrapper
    isocalc_settings = {
        # These instrument settings are usually customized on a per-dataset basis out of a set of
        # 18 possible combinations, but most of EMBL's datasets are compatible with the following settings:
        'charge': {
            'polarity': polarity,
            'n_charges': 1,
        },
        # Rounding to match production implementation
        'isocalc_sigma': float(f"{isocalc_sigma:f}"),
    }
    isocalc_wrapper = IsocalcWrapper(isocalc_settings)

    def calculate_peaks_for_formula(formula_i, formula):
        # One (formula_i, peak_i, mz, int) row per centroid; none if no centroids.
        mzs, ints = isocalc_wrapper.centroids(formula)
        if mzs is None:
            return []
        peak_indices = range(len(mzs))
        return list(zip(repeat(formula_i), peak_indices, mzs, ints))

    def calculate_peaks_chunk(obj, id, storage):
        # The parameter names are kept as-is: pywren may bind them by name.
        print(f'Calculating peaks from formulas chunk {obj.key}')
        chunk_df = pd.read_msgpack(obj.data_stream._raw_stream)

        peaks = []
        for formula_i, formula in chunk_df.formula.items():
            peaks.extend(calculate_peaks_for_formula(formula_i, formula))

        peaks_df = pd.DataFrame(peaks, columns=['formula_i', 'peak_i', 'mz', 'int'])
        peaks_df = peaks_df.set_index('formula_i')

        print(f'Storing centroids chunk {id}')
        centroids_chunk_key = f'{centroids_chunks_prefix}/{id}.msgpack'
        storage.put_object(Bucket=bucket, Key=centroids_chunk_key, Body=peaks_df.to_msgpack())

        return peaks_df.shape[0]

    memory_capacity_mb = 2048
    pw = pywren.ibm_cf_executor(config=config)
    formulas_location = f'cos://{bucket}/{formulas_chunks_prefix}/'
    futures = pw.map(calculate_peaks_chunk, formulas_location, runtime_memory=memory_capacity_mb)
    centroids_chunks_n = pw.get_result(futures)
    append_pywren_stats(futures, memory_mb=memory_capacity_mb, cloud_objects_n=len(futures))

    n_centroids_chunks = len(centroids_chunks_n)
    num_centroids = sum(centroids_chunks_n)
    logger.info(f'Calculated {num_centroids} centroids in {n_centroids_chunks} chunks')
    return num_centroids, n_centroids_chunks
    def clean(self):
        """Delete the cloud objects referenced by cached entries, then the cache itself.

        Each cached entry is loaded and inspected for ``CloudObject`` instances.
        An entry may be a single ``CloudObject``, a list whose first element is a
        ``CloudObject``, or a tuple whose members are either of those. Everything
        found is passed to ``self.pywren_executor.clean`` and the cache prefix is
        then cleared with ``clean_from_cos``.

        Empty lists are skipped. They used to raise ``IndexError`` when the first
        element was inspected.
        """
        def _cloud_objects_in(item):
            # Return the CloudObjects directly represented by ``item``, if any.
            if isinstance(item, list):
                # Only the first element is checked, matching the cached layout.
                if item and isinstance(item[0], CloudObject):
                    return item
                return []
            if isinstance(item, CloudObject):
                return [item]
            return []

        keys = list_keys(self.bucket, self.prefix, self.storage_handler)

        cobjects_to_clean = []
        for cache_key in keys:
            cache_data = self.load(cache_key)
            # Tuples are searched one level deep; anything else is one candidate.
            candidates = cache_data if isinstance(cache_data, tuple) else (cache_data,)
            for candidate in candidates:
                cobjects_to_clean.extend(_cloud_objects_in(candidate))

        self.pywren_executor.clean(cs=cobjects_to_clean)
        clean_from_cos(self.config, self.bucket, self.prefix,
                       self.storage_handler)
def build_database_local(storage, config, input_db):
    """Build the formulas database without pywren and store it in chunks.

    Clears the ``formulas_chunks`` and ``formula_to_id_chunks`` prefixes, builds
    the formulas DataFrame with ``get_formulas_df`` and stores it through
    ``store_formula_segments`` and ``store_formula_to_id``.

    Args:
        storage: Storage client passed through to the helper functions.
        config: Configuration mapping; ``config["storage"]["db_bucket"]`` is read.
        input_db: Mapping providing the ``formulas_chunks`` and
            ``formula_to_id_chunks`` prefixes; also passed to ``get_formulas_df``.

    Returns:
        tuple: ``(num_formulas, n_formulas_chunks)``.
    """
    bucket = config["storage"]["db_bucket"]
    formulas_chunks_prefix = input_db["formulas_chunks"]
    formula_to_id_chunks_prefix = input_db["formula_to_id_chunks"]

    # Drop whatever a previous run left under both output prefixes.
    for stale_prefix in (formulas_chunks_prefix, formula_to_id_chunks_prefix):
        clean_from_cos(config, bucket, stale_prefix)

    formulas_df = get_formulas_df(storage, bucket, input_db)
    num_formulas = len(formulas_df)
    logger.info(f'Generated {num_formulas} formulas')

    n_formulas_chunks = store_formula_segments(
        storage, bucket, formulas_chunks_prefix, formulas_df)
    logger.info(f'Stored {num_formulas} formulas in {n_formulas_chunks} chunks')

    n_formula_to_id = store_formula_to_id(
        storage, bucket, formula_to_id_chunks_prefix, formulas_df)
    logger.info(f'Built {n_formula_to_id} formula_to_id dictionaries chunks')

    return num_formulas, n_formulas_chunks
Example #9
0
    def segment_centroids(self):
        """Clip the centroids to the dataset m/z range and split them into segments.

        The m/z range is taken from the first and last rows of
        ``self.ds_segments_bounds``. The centroid chunks are clipped to it with
        ``clip_centr_df``, segment bounds are derived with
        ``define_centr_segments``, and the module-level ``segment_centroids``
        function performs the split. The results are stored in ``self.centr_n``,
        ``self.centr_segm_lower_bounds`` and ``self.centr_segm_n``.
        """
        mz_min = self.ds_segments_bounds[0, 0]
        mz_max = self.ds_segments_bounds[-1, 1]

        db_bucket = self.config["storage"]["db_bucket"]
        clipped_chunks_prefix = self.input_db["clipped_centroids_chunks"]

        clean_from_cos(self.config, db_bucket, clipped_chunks_prefix)
        self.centr_n = clip_centr_df(
            self.pywren_executor, db_bucket,
            self.input_db["centroids_chunks"], clipped_chunks_prefix,
            mz_min, mz_max)

        segments_prefix = self.input_db["centroids_segments"]
        clean_from_cos(self.config, db_bucket, segments_prefix)
        self.centr_segm_lower_bounds = define_centr_segments(
            self.pywren_executor, db_bucket, clipped_chunks_prefix,
            self.centr_n, self.ds_segm_n, self.ds_segm_size_mb)

        # Inside the method body this name resolves to the module-level function.
        self.centr_segm_n = segment_centroids(
            self.pywren_executor, db_bucket, clipped_chunks_prefix,
            segments_prefix, self.centr_segm_lower_bounds)
        logger.info(f'Segmented centroids chunks into {self.centr_segm_n} segments')
Example #10
0
def build_database(config, input_db):
    """Generate, deduplicate and store ion formulas using pywren.

    The work runs as four pywren map stages:

    1. Per adduct: generate formulas for every database/modifier combination,
       partition them by hash into ``N_HASH_SEGMENTS`` segments and store each
       partition as ``<formulas_chunks>/chunk/<segm_i>/<adduct>.pickle``.
    2. Per hash segment: count the unique formulas.
    3. Per hash segment: deduplicate, assign a contiguous ``formula_i`` range,
       and store the segment plus its sub-segments as msgpack.
    4. Per formula_to_id chunk: build and store a ``formula -> formula_i`` dict.

    Changes from the previous version:

    * The results of the upload thread pools are consumed, so a failed
      ``put_object`` raises instead of being dropped.
    * Sub-segment uploads iterate over the sub-segments that actually exist
      rather than over ``range(n_threads)``.
    * ``formula_i`` offsets are looked up by hash-segment id. They used to be
      computed by slicing ``formulas_nums`` with the id, which is only correct
      when ``segments_n`` is exactly ``0..k`` in order.

    Args:
        config: Configuration mapping; ``config["storage"]["db_bucket"]`` is read
            and the whole mapping is handed to pywren and ``clean_from_cos``.
        input_db: Mapping providing ``formulas_chunks``, ``formula_to_id_chunks``,
            ``adducts``, ``modifiers`` and ``databases``.

    Returns:
        tuple: ``(num_formulas, n_formulas_chunks)``.
    """
    bucket = config["storage"]["db_bucket"]
    formulas_chunks_prefix = input_db["formulas_chunks"]
    clean_from_cos(config, bucket, formulas_chunks_prefix)

    adducts = [*input_db['adducts'], *DECOY_ADDUCTS]
    modifiers = input_db['modifiers']
    databases = input_db['databases']

    N_HASH_SEGMENTS = 32  # should be less than N_FORMULAS_SEGMENTS

    def hash_formula_to_segment(formula):
        # MD5 is used only to spread formulas across segments.
        m = hashlib.md5()
        m.update(formula.encode('utf-8'))
        return int(m.hexdigest(), 16) % N_HASH_SEGMENTS

    def generate_formulas(adduct, ibm_cos):
        print(f'Generating formulas for adduct {adduct}')

        def _get_mols(mols_key):
            # NOTE(review): pickle.loads is only safe if the bucket contents are trusted.
            return pickle.loads(
                read_object_with_retry(ibm_cos, bucket, mols_key))

        with ThreadPoolExecutor(max_workers=128) as pool:
            mols_list = list(pool.map(_get_mols, databases))

        formulas = set()
        for mols in mols_list:
            for modifier in modifiers:
                formulas.update(
                    map(safe_generate_ion_formula, mols, repeat(modifier),
                        repeat(adduct)))

        # safe_generate_ion_formula may yield None; it is not a formula.
        formulas.discard(None)

        formulas_segments = {}
        for formula in formulas:
            segm_i = hash_formula_to_segment(formula)
            formulas_segments.setdefault(segm_i, []).append(formula)

        def _store(segm_i):
            ibm_cos.put_object(
                Bucket=bucket,
                Key=f'{formulas_chunks_prefix}/chunk/{segm_i}/{adduct}.pickle',
                Body=pickle.dumps(formulas_segments[segm_i]))

        segments_n = list(formulas_segments)
        with ThreadPoolExecutor(max_workers=128) as pool:
            # Consume the iterator so that a failed upload re-raises here.
            list(pool.map(_store, segments_n))

        return segments_n

    pw = pywren.ibm_cf_executor(config=config)
    memory_capacity_mb = 512
    futures = pw.map(generate_formulas,
                     adducts,
                     runtime_memory=memory_capacity_mb)
    # Sorted so that formula_i ranges grow with the hash-segment id.
    segments_n = sorted(set().union(*pw.get_result(futures)))
    append_pywren_stats(futures,
                        memory=memory_capacity_mb,
                        plus_objects=len(adducts) * len(segments_n))

    def deduplicate_formulas_segment(segm_i, ibm_cos, clean=True):
        print(f'Deduplicating formulas segment {segm_i}')
        keys = list_keys(bucket, f'{formulas_chunks_prefix}/chunk/{segm_i}/',
                         ibm_cos)

        segm = set()
        for key in keys:
            segm_formulas_chunk = pickle.loads(
                read_object_with_retry(ibm_cos, bucket, key))
            segm.update(segm_formulas_chunk)

        if clean:
            clean_from_cos(config, bucket,
                           f'{formulas_chunks_prefix}/chunk/{segm_i}/',
                           ibm_cos)

        return segm

    def get_formulas_number_per_chunk(segm_i, ibm_cos):
        segm = deduplicate_formulas_segment(segm_i, ibm_cos, clean=False)
        return len(segm)

    pw = pywren.ibm_cf_executor(config=config)
    memory_capacity_mb = 512
    futures = pw.map(get_formulas_number_per_chunk,
                     segments_n,
                     runtime_memory=memory_capacity_mb)
    formulas_nums = pw.get_result(futures)
    append_pywren_stats(futures, memory=memory_capacity_mb)

    # First formula_i of every hash segment, keyed by segment id. formulas_nums
    # is assumed to follow the order of segments_n (the order passed to pw.map).
    formula_i_offsets = {}
    next_formula_i = 0
    for segm_i, formulas_num in zip(segments_n, formulas_nums):
        formula_i_offsets[segm_i] = next_formula_i
        next_formula_i += formulas_num

    def store_formulas_segment(segm_i, ibm_cos):
        segm = deduplicate_formulas_segment(segm_i, ibm_cos)
        formula_i_start = formula_i_offsets[segm_i]
        formula_i_end = formula_i_start + len(segm)
        segm = pd.DataFrame(sorted(segm),
                            columns=['formula'],
                            index=pd.RangeIndex(formula_i_start,
                                                formula_i_end,
                                                name='formula_i'))

        ibm_cos.put_object(
            Bucket=bucket,
            Key=f'{formulas_chunks_prefix}_fdr/{segm_i}.msgpack',
            Body=segm.to_msgpack())

        n_threads = N_FORMULAS_SEGMENTS // N_HASH_SEGMENTS
        subsegm_size = math.ceil(len(segm) / n_threads)
        segm_list = [
            segm[i:i + subsegm_size]
            for i in range(0, segm.shape[0], subsegm_size)
        ]

        def _store(segm_j):
            chunk_id = segm_i * n_threads + segm_j
            print(f'Storing formulas segment {chunk_id}')
            ibm_cos.put_object(Bucket=bucket,
                               Key=f'{formulas_chunks_prefix}/{chunk_id}.msgpack',
                               Body=segm_list[segm_j].to_msgpack())

        with ThreadPoolExecutor(max_workers=128) as pool:
            # segm_list can hold fewer than n_threads pieces, so iterate over
            # the pieces that exist. Consuming the iterator surfaces failures.
            list(pool.map(_store, range(len(segm_list))))

        return [len(subsegm) for subsegm in segm_list]

    pw = pywren.ibm_cf_executor(config=config)
    memory_capacity_mb = 512
    futures = pw.map(store_formulas_segment,
                     segments_n,
                     runtime_memory=memory_capacity_mb)
    results = pw.get_result(futures)
    append_pywren_stats(futures,
                        memory=memory_capacity_mb,
                        plus_objects=N_FORMULAS_SEGMENTS,
                        minus_objects=len(adducts) * len(segments_n))

    num_formulas = sum(formulas_nums)
    n_formulas_chunks = sum(len(result) for result in results)
    logger.info(
        f'Generated {num_formulas} formulas in {n_formulas_chunks} chunks')

    formula_to_id_chunks_prefix = input_db["formula_to_id_chunks"]
    clean_from_cos(config, bucket, formula_to_id_chunks_prefix)
    # Sizing heuristic: 200 bytes per formula, 512 MB per dictionary chunk.
    formulas_bytes = 200 * num_formulas
    formula_to_id_chunk_mb = 512
    N_FORMULA_TO_ID = int(
        math.ceil(formulas_bytes / (formula_to_id_chunk_mb * 1024**2)))

    def store_formula_to_id_chunk(ch_i, ibm_cos):
        print(f'Storing formula_to_id dictionary chunk {ch_i}')
        chunks_per_dict = N_FORMULAS_SEGMENTS // N_FORMULA_TO_ID
        start_id = chunks_per_dict * ch_i
        end_id = chunks_per_dict * (ch_i + 1)
        keys = [
            f'{formulas_chunks_prefix}/{formulas_chunk}.msgpack'
            for formulas_chunk in range(start_id, end_id)
        ]

        def _get(key):
            formula_chunk = read_object_with_retry(ibm_cos, bucket, key,
                                                   pd.read_msgpack)
            formula_to_id_chunk = dict(
                zip(formula_chunk.formula, formula_chunk.index))
            return formula_to_id_chunk

        with ThreadPoolExecutor(max_workers=128) as pool:
            results = list(pool.map(_get, keys))

        formula_to_id = {}
        for chunk_dict in results:
            formula_to_id.update(chunk_dict)

        ibm_cos.put_object(Bucket=bucket,
                           Key=f'{formula_to_id_chunks_prefix}/{ch_i}.msgpack',
                           Body=msgpack.dumps(formula_to_id))

    pw = pywren.ibm_cf_executor(config=config)
    safe_mb = 512
    memory_capacity_mb = formula_to_id_chunk_mb * 2 + safe_mb
    futures = pw.map(store_formula_to_id_chunk,
                     range(N_FORMULA_TO_ID),
                     runtime_memory=memory_capacity_mb)
    pw.get_result(futures)
    append_pywren_stats(futures,
                        memory=memory_capacity_mb,
                        plus_objects=N_FORMULA_TO_ID)
    logger.info(f'Built {N_FORMULA_TO_ID} formula_to_id dictionaries chunks')

    return num_formulas, n_formulas_chunks
Example #11
0
 def split_ds(self):
     """Clear the ``ds_chunks`` prefix and split the dataset spectra into chunks.

     The value returned by ``chunk_spectra`` is stored in
     ``self.specra_chunks_keys``. The attribute spelling is kept as-is because
     other code may read it.
     """
     ds_bucket = self.config["storage"]["ds_bucket"]
     chunks_prefix = self.input_data["ds_chunks"]
     clean_from_cos(self.config, ds_bucket, chunks_prefix)

     chunk_keys = chunk_spectra(self.config, self.input_data,
                                self.imzml_parser, self.coordinates)
     self.specra_chunks_keys = chunk_keys