    def merge_centr_df_segments(segm_i, ibm_cos):
        print(f'Merging segment {segm_i} clipped centroids chunks')

        keys = list_keys(bucket, f'{centr_segm_prefix}/chunk/{segm_i}/', ibm_cos)

        def _merge(key):
            segm_centr_df_chunk = read_object_with_retry(ibm_cos, bucket, key, pd.read_msgpack)
            return segm_centr_df_chunk

        with ThreadPoolExecutor(max_workers=128) as pool:
            segm = pd.concat(list(pool.map(_merge, keys)))
            del segm['segm_i']  # drop the first-level segment id; segment_centr_df assigns fresh second-level ids below

        clean_from_cos(None, bucket, f'{centr_segm_prefix}/chunk/{segm_i}/', ibm_cos)
        centr_segm_df = segment_centr_df(segm, centr_segm_lower_bounds[segm_i])

        # Offset of this first-level segment's ids within the global numbering
        base_id = sum(len(bounds) for bounds in centr_segm_lower_bounds[:segm_i])

        def _second_level_upload(args):
            segm_j, df = args
            id = base_id + segm_j
            print(f'Storing centroids segment {id}')
            ibm_cos.put_object(Bucket=bucket,
                               Key=f'{centr_segm_prefix}/{id}.msgpack',
                               Body=df.to_msgpack())

        with ThreadPoolExecutor(max_workers=128) as pool:
            pool.map(_second_level_upload, [(segm_j, df) for segm_j, df in centr_segm_df.groupby('segm_i')])
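
    # `read_object_with_retry` is used throughout this listing but not defined
    # here. A minimal sketch, assuming the ibm_cos/storage client follows the
    # boto3 get_object interface; the retry count and backoff are illustrative,
    # not the pipeline's actual values.
    def _read_object_with_retry_sketch(storage, bucket, key, deserialize=None, retries=3):
        import time
        for attempt in range(retries):
            try:
                body = storage.get_object(Bucket=bucket, Key=key)['Body']
                # Apply the caller-supplied deserializer (e.g. pd.read_msgpack,
                # msgpack.load) to the streaming body, or return the raw bytes
                return deserialize(body) if deserialize else body.read()
            except Exception:
                if attempt == retries - 1:
                    raise  # out of retries, surface the original error
                time.sleep(2 ** attempt)  # exponential backoff between attempts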
    def merge_spectra_chunk_segments(segm_i, ibm_cos):
        print(f'Merging segment {segm_i} spectra chunks')

        keys = list_keys(bucket, f'{ds_segments_prefix}/chunk/{segm_i}/', ibm_cos)

        def _merge(key):
            segm_spectra_chunk = read_object_with_retry(ibm_cos, bucket, key, msgpack.load)
            return segm_spectra_chunk

        with ThreadPoolExecutor(max_workers=128) as pool:
            segm = list(pool.map(_merge, keys))

        segm = np.concatenate(segm)

        # Alternative in-place sorting (slower) :
        # segm.view(f'{segm_dtype},{segm_dtype},{segm_dtype}').sort(order=['f1'], axis=0)
        segm = segm[segm[:, 1].argsort()]

        clean_from_cos(None, bucket, f'{ds_segments_prefix}/chunk/{segm_i}/', ibm_cos)
        bounds_list = ds_segments_bounds[segm_i]

        # Offset of this first-level segment's ids within the global numbering
        base_id = sum(len(bounds) for bounds in ds_segments_bounds[:segm_i])

        segms_len = []
        for segm_j, (l, r) in enumerate(bounds_list):
            segm_start, segm_end = np.searchsorted(segm[:, 1], (l, r))  # mz expected to be in column 1
            sub_segm = segm[segm_start:segm_end]
            segms_len.append(len(sub_segm))
            id = base_id + segm_j
            print(f'Storing dataset segment {id}')
            ibm_cos.put_object(Bucket=bucket,
                               Key=f'{ds_segments_prefix}/{id}.msgpack',
                               Body=msgpack.dumps(sub_segm))

        return segms_len
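
    # Toy illustration (hypothetical helper, not part of the pipeline) of the
    # searchsorted split above: with rows sorted by mz in column 1, each (l, r)
    # bound selects the half-open mz range [l, r).
    def _searchsorted_split_example():
        segm = np.array([[0.5, 100.0], [0.7, 200.0], [0.9, 300.0]])
        lo, hi = np.searchsorted(segm[:, 1], (150.0, 300.0))
        return segm[lo:hi]  # [[0.7, 200.0]] -- the only row with 150 <= mz < 300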
    def build_ranking(group_i, ranking_i, database, modifier, adduct, id, storage):
        print("Building ranking...")
        print(f'job_i: {id}')
        print(f'ranking_i: {ranking_i}')
        print(f'database: {database}')
        print(f'modifier: {modifier}')
        print(f'adduct: {adduct}')
        # For every unmodified formula in `database`, look up the MSM score for the molecule
        # that it would become after the modifier and adduct are applied
        mols = pickle.loads(read_object_with_retry(storage, bucket, database))
        if adduct is not None:
            # Target rankings use the same adduct for all molecules
            mol_formulas = list(map(safe_generate_ion_formula, mols, repeat(modifier), repeat(adduct)))
        else:
            # Decoy rankings use a consistent random adduct for each molecule, chosen
            # so that it doesn't overlap with other decoy rankings for this molecule
            adducts = _get_random_adduct_set(len(mols), decoy_adducts, ranking_i)
            mol_formulas = list(map(safe_generate_ion_formula, mols, repeat(modifier), adducts))

        formula_to_id = {}
        keys = list_keys(bucket, f'{input_db["formula_to_id_chunks"]}/', storage)
        for key in keys:
            formula_to_id_chunk = read_object_with_retry(storage, bucket, key, msgpack_load_text)

            for formula in mol_formulas:
                formula_i = formula_to_id_chunk.get(formula)
                if formula_i is not None:
                    formula_to_id[formula] = formula_i

        formula_is = [formula and formula_to_id.get(formula) for formula in mol_formulas]
        msm = [formula_i and msm_lookup.get(formula_i) for formula_i in formula_is]
        if adduct is not None:
            ranking_df = pd.DataFrame({'mol': mols, 'msm': msm}, index=formula_is)
        else:
            # Specific molecules don't matter in the decoy rankings, only their msm distribution
            ranking_df = pd.DataFrame({'msm': msm})
        ranking_df = ranking_df[~ranking_df.msm.isna()]

        return id, storage.put_cobject(pickle.dumps(ranking_df))
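
    # `_get_random_adduct_set` is referenced above but not shown. A minimal
    # sketch under the assumption that each molecule needs one deterministic
    # pseudo-random decoy adduct, rotated by the ranking index so different
    # decoy rankings don't reuse the same adduct; the seed and the rotation
    # scheme are illustrative, not necessarily the pipeline's own.
    def _get_random_adduct_set_sketch(size, adducts, offset):
        rng = np.random.RandomState(42)  # fixed seed: every worker draws the same base sequence
        idxs = (rng.randint(0, len(adducts), size) + offset) % len(adducts)
        return np.array(adducts)[idxs]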
    def deduplicate_formulas_segment(segm_i, storage, clean=True):
        print(f'Deduplicating formulas segment {segm_i}')
        keys = list_keys(bucket, f'{formulas_chunks_prefix}/chunk/{segm_i}/', storage)

        segm = set()
        for key in keys:
            segm_formulas_chunk = pickle.loads(read_object_with_retry(storage, bucket, key))
            segm.update(segm_formulas_chunk)

        if clean:
            clean_from_cos(config, bucket, f'{formulas_chunks_prefix}/chunk/{segm_i}/', storage)

        return segm
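
    # Hypothetical orchestration sketch: deduplication fans out one task per
    # segment and the partial sets are unioned on the driver. Executor and
    # variable names here are assumptions in the pywren-ibm-cloud style:
    #
    #   futures = pywren_executor.map(deduplicate_formulas_segment,
    #                                 range(n_formulas_segments))
    #   formulas = set().union(*pywren_executor.get_result(futures))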
    def clean(self):
        keys = list_keys(self.bucket, self.prefix, self.storage_handler)

        cobjects_to_clean = []
        for cache_key in keys:
            cache_data = self.load(cache_key)
            if isinstance(cache_data, tuple):
                for obj in cache_data:
                    if isinstance(obj, list):
                        # Guard against empty lists before inspecting the first element
                        if obj and isinstance(obj[0], CloudObject):
                            cobjects_to_clean.extend(obj)
                    elif isinstance(obj, CloudObject):
                        cobjects_to_clean.append(obj)
            elif isinstance(cache_data, list):
                if cache_data and isinstance(cache_data[0], CloudObject):
                    cobjects_to_clean.extend(cache_data)
            elif isinstance(cache_data, CloudObject):
                cobjects_to_clean.append(cache_data)

        self.pywren_executor.clean(cs=cobjects_to_clean)
        clean_from_cos(self.config, self.bucket, self.prefix, self.storage_handler)
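
    # Usage sketch (hypothetical wiring): `clean` walks every cached entry under
    # the prefix, collects CloudObject references whether stored bare, in tuples,
    # or in lists, frees them through the executor, and finally clears the cache
    # keys themselves from COS:
    #
    #   cacher = PipelineCacher(pywren_executor, config, bucket, prefix)  # class name is an assumption
    #   cacher.clean()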