Example #1
    def build_ranking(group_i, ranking_i, database, modifier, adduct, id,
                      storage):
        print("Building ranking...")
        print(f'job_i: {id}')
        print(f'ranking_i: {ranking_i}')
        print(f'database: {database}')
        print(f'modifier: {modifier}')
        print(f'adduct: {adduct}')
        # For every unmodified formula in `database`, look up the MSM score for the molecule
        # that it would become after the modifier and adduct are applied
        mols = pickle.loads(read_object_with_retry(storage, bucket, database))
        if adduct is not None:
            # Target rankings use the same adduct for all molecules
            mol_formulas = list(
                map(safe_generate_ion_formula, mols, repeat(modifier),
                    repeat(adduct)))
        else:
            # Decoy rankings use a consistent random adduct for each molecule, chosen so that it doesn't overlap
            # with other decoy rankings for this molecule
            adducts = _get_random_adduct_set(len(mols), decoy_adducts,
                                             ranking_i)
            mol_formulas = list(
                map(safe_generate_ion_formula, mols, repeat(modifier),
                    adducts))

        formula_to_id = {}
        keys = list_keys(bucket, f'{input_db["formula_to_id_chunks"]}/',
                         storage)
        for key in keys:
            formula_to_id_chunk = read_object_with_retry(
                storage, bucket, key, msgpack_load_text)

            for formula in mol_formulas:
                formula_id = formula_to_id_chunk.get(formula)
                if formula_id is not None:
                    formula_to_id[formula] = formula_id

        # `None` propagates for formulas that failed to generate or were not found
        formula_is = [
            formula and formula_to_id.get(formula) for formula in mol_formulas
        ]
        msm = [
            formula_i and msm_lookup.get(formula_i) for formula_i in formula_is
        ]
        if adduct is not None:
            ranking_df = pd.DataFrame({'mol': mols, 'msm': msm},
                                      index=formula_is)
            ranking_df = ranking_df[~ranking_df.msm.isna()]
        else:
            # Specific molecules don't matter in the decoy rankings, only their msm distribution
            ranking_df = pd.DataFrame({'msm': msm})
            ranking_df = ranking_df[~ranking_df.msm.isna()]

        return id, storage.put_cobject(pickle.dumps(ranking_df))
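Every example on this page goes through the same retry helper, whose implementation isn't shown here. The sketch below is a plausible reconstruction, assuming a Lithops-style storage client with a get_object(bucket, key, stream=...) method; the retries count and backoff policy are illustrative assumptions, not the real code.

    import time

    def read_object_with_retry(storage, bucket, key, deserialiser=None, retries=3):
        # Hypothetical sketch: fetch an object from COS, retrying transient failures
        for attempt in range(1, retries + 1):
            try:
                if deserialiser is not None:
                    # Callers pass stream readers such as msgpack.load,
                    # msgpack_load_text or pd.read_msgpack
                    return deserialiser(storage.get_object(bucket, key, stream=True))
                # No deserialiser: return raw bytes (callers apply pickle.loads)
                return storage.get_object(bucket, key)
            except Exception:
                if attempt == retries:
                    raise
                time.sleep(2 ** attempt)  # assumed exponential backoff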
Example #2
 def run_ranking(ibm_cos, data_bucket, target_key, decoy_key):
     target = pickle.loads(read_object_with_retry(ibm_cos, data_bucket, target_key))
     decoy = pickle.loads(read_object_with_retry(ibm_cos, data_bucket, decoy_key))
     # Interleave target and decoy hits, best MSM score first
     merged = pd.concat([target.assign(is_target=1), decoy.assign(is_target=0)], sort=False)
     merged = merged.sort_values('msm', ascending=False)
     # At each rank, FDR = (decoys ranked above) / (targets ranked above)
     decoy_cumsum = (merged.is_target == 0).cumsum()
     target_cumsum = merged.is_target.cumsum()
     base_fdr = np.clip(decoy_cumsum / target_cumsum, 0, 1)
     base_fdr[np.isnan(base_fdr)] = 1
     target_fdrs = merged.assign(fdr=base_fdr)[lambda df: df.is_target == 1]
     target_fdrs = target_fdrs.drop('is_target', axis=1)
     # Make FDR monotonic: take the running minimum from the lowest MSM upward,
     # so a higher-scoring molecule never gets a worse FDR than a lower-scoring one
     target_fdrs = target_fdrs.sort_values('msm')
     target_fdrs = target_fdrs.assign(fdr=np.minimum.accumulate(target_fdrs.fdr))
     target_fdrs = target_fdrs.sort_index()
     return target_fdrs
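To make the FDR arithmetic in run_ranking concrete, here is a toy run on made-up MSM scores (not data from the pipeline). Each target's base FDR is the number of decoys ranked above it divided by the number of targets seen so far:

    import numpy as np
    import pandas as pd

    target = pd.DataFrame({'msm': [0.95, 0.80, 0.40]})
    decoy = pd.DataFrame({'msm': [0.70, 0.35, 0.30]})
    merged = pd.concat([target.assign(is_target=1), decoy.assign(is_target=0)], sort=False)
    merged = merged.sort_values('msm', ascending=False)
    fdr = np.clip((merged.is_target == 0).cumsum() / merged.is_target.cumsum(), 0, 1)
    # The targets at 0.95 and 0.80 have no decoys above them (FDR 0); the target
    # at 0.40 sits below one decoy, with three targets seen so far, giving FDR 1/3
    print(merged.assign(fdr=fdr))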
Example #3
    def process_centr_segment(obj, ibm_cos, internal_storage):
        print(f'Reading centroids segment {obj.key}')
        # read the relevant part of the database
        try:
            centr_df = pd.read_msgpack(obj.data_stream)
        except Exception:
            # the streamed body may be broken - fall back to a retried COS read
            centr_df = read_object_with_retry(ibm_cos, obj.bucket, obj.key, pd.read_msgpack)

        # find range of datasets
        first_ds_segm_i, last_ds_segm_i = choose_ds_segments(ds_segments_bounds, centr_df, ppm)
        print(f'Reading dataset segments {first_ds_segm_i}-{last_ds_segm_i}')
        # read all segments in loop from COS
        sp_arr = read_ds_segments(ds_bucket, ds_segm_prefix, first_ds_segm_i, last_ds_segm_i,
                                  ds_segms_len[first_ds_segm_i:last_ds_segm_i+1], pw_mem_mb,
                                  ds_segm_size_mb, ds_segm_dtype, ibm_cos)

        formula_images_it = gen_iso_images(sp_inds=sp_arr[:,0], sp_mzs=sp_arr[:,1], sp_ints=sp_arr[:,2],
                                           centr_df=centr_df, nrows=nrows, ncols=ncols, ppm=ppm, min_px=1)
        # Budget the in-memory image buffer: reserve headroom for the runtime and
        # the loaded dataset segments, then take a third of what remains
        safe_mb = 1024
        max_formula_images_mb = (pw_mem_mb - safe_mb - (last_ds_segm_i - first_ds_segm_i + 1) * ds_segm_size_mb) // 3
        print(f'Max formula_images size: {max_formula_images_mb} MB')
        images_manager = ImagesManager(internal_storage, output_bucket, max_formula_images_mb * 1024 ** 2)
        formula_image_metrics(formula_images_it, compute_metrics, images_manager)
        images_cloud_objs = images_manager.finish()

        print(f'Centroids segment {obj.key} finished')
        formula_metrics_df = pd.DataFrame.from_dict(images_manager.formula_metrics, orient='index')
        formula_metrics_df.index.name = 'formula_i'
        return formula_metrics_df, images_cloud_objs
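choose_ds_segments isn't defined on this page; judging from the call above, it maps a centroid chunk's m/z range, widened by the ppm tolerance, onto the range of dataset segments that overlap it. A rough sketch, assuming ds_segments_bounds is an (n, 2) array of per-segment [min_mz, max_mz] bounds:

    import numpy as np

    def choose_ds_segments(ds_segments_bounds, centr_df, ppm):
        # Widen the centroid chunk's m/z range by the ppm tolerance
        min_mz = centr_df.mz.min() * (1 - ppm * 1e-6)
        max_mz = centr_df.mz.max() * (1 + ppm * 1e-6)
        # Clamp to the first/last segments whose bounds overlap that range
        first_i = max(0, np.searchsorted(ds_segments_bounds[:, 0], min_mz, side='right') - 1)
        last_i = min(len(ds_segments_bounds) - 1,
                     np.searchsorted(ds_segments_bounds[:, 1], max_mz, side='left'))
        return first_i, last_i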
Example #4
 def read_ds_segment(ds_segm_key):
     data = read_object_with_retry(ibm_cos, ds_bucket, ds_segm_key, msgpack.load)
     # a segment may be stored as a list of chunks or as a single array
     if isinstance(data, list):
         sp_arr = np.concatenate(data)
     else:
         sp_arr = data
     return sp_arr
Example #5
    def deduplicate_formulas_segment(segm_i, storage, clean=True):
        print(f'Deduplicating formulas segment {segm_i}')
        keys = list_keys(bucket, f'{formulas_chunks_prefix}/chunk/{segm_i}/', storage)

        segm = set()
        for key in keys:
            segm_formulas_chunk = pickle.loads(read_object_with_retry(storage, bucket, key))
            segm.update(segm_formulas_chunk)

        if clean:
            clean_from_cos(config, bucket, f'{formulas_chunks_prefix}/chunk/{segm_i}/', storage)

        return segm
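Per-segment functions like this one are meant to be fanned out as serverless tasks. A hypothetical driver using Lithops, which injects the reserved storage parameter automatically (n_segments is a placeholder value):

    import lithops

    n_segments = 8  # placeholder
    fexec = lithops.FunctionExecutor()
    futures = fexec.map(deduplicate_formulas_segment, range(n_segments))
    segm_sets = fexec.get_result(futures)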
Example #6
    def clean(self, database=True, dataset=True, hard=False):
        unique_prefixes = []
        if not hard:
            if database:
                unique_prefixes.append(self.prefixes[':db'])
            if dataset:
                unique_prefixes.append(self.prefixes[':ds'])
            if database or dataset:
                unique_prefixes.append(self.prefixes[':ds/:db'])
        else:
            unique_prefixes.append(self.prefixes[''])

        keys = [
            key for prefix in unique_prefixes
            for key in self.storage.list_keys(self.bucket, prefix)
        ]

        cobjects_to_clean = []
        for cache_key in keys:
            cache_data = read_object_with_retry(self.storage, self.bucket,
                                                cache_key, deserialise)

            if isinstance(cache_data, tuple):
                for obj in cache_data:
                    if isinstance(obj, list):
                        # guard against empty lists before inspecting the first item
                        if obj and isinstance(obj[0], CloudObject):
                            cobjects_to_clean.extend(obj)
                    elif isinstance(obj, CloudObject):
                        cobjects_to_clean.append(obj)
            elif isinstance(cache_data, list):
                if cache_data and isinstance(cache_data[0], CloudObject):
                    cobjects_to_clean.extend(cache_data)
            elif isinstance(cache_data, CloudObject):
                cobjects_to_clean.append(cache_data)

        self.storage.delete_cloudobjects(cobjects_to_clean)
        for prefix in unique_prefixes:
            keys = self.storage.list_keys(self.bucket, prefix)
            if keys:
                self.storage.delete_objects(self.bucket, keys)
                logger.info(
                    f'Removed {len(keys)} objects from {self.storage.backend}://{self.bucket}/{prefix}'
                )
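Hypothetical call sites for the cleaner above; PipelineCacher is an assumed name for the class that defines clean(), built from a storage client, bucket and prefix map:

    cacher = PipelineCacher(storage, bucket, prefixes)
    cacher.clean(database=True, dataset=False)  # drop only database-scoped cache
    cacher.clean(hard=True)                     # wipe everything under the root prefix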
Example #7
 def _get_mols(mols_key):
     return pickle.loads(
         read_object_with_retry(ibm_cos, bucket, mols_key))
Example #8
 def _get(key):
     formula_chunk = read_object_with_retry(ibm_cos, bucket, key,
                                            pd.read_msgpack)
     formula_to_id_chunk = dict(
         zip(formula_chunk.formula, formula_chunk.index))
     return formula_to_id_chunk
Example #9
 def _get_mols(mols_key):
     return pickle.loads(read_object_with_retry(storage, bucket, mols_key))
Example #10
 def _merge(key):
     segm_centr_df_chunk = read_object_with_retry(ibm_cos, bucket, key, pd.read_msgpack)
     return segm_centr_df_chunk
Example #11
 def _merge(key):
     segm_spectra_chunk = read_object_with_retry(ibm_cos, bucket, key, msgpack.load)
     return segm_spectra_chunk