Example #1
def upload_chunk(ch_i, storage):
    chunk_sp_inds = chunks[ch_i]
    # Get imzml_reader from COS because it's too big to include via Lithops captured vars
    imzml_reader = read_cloud_object_with_retry(storage, imzml_reader_cobject,
                                                deserialise)
    # Total number of peaks across this chunk's spectra
    n_spectra = sum(imzml_reader.mzLengths[sp_i] for sp_i in chunk_sp_inds)
    sp_mz_int_buf = np.zeros((n_spectra, 3), dtype=imzml_reader.mzPrecision)

    # Fill the buffer with (spectrum index, mz, intensity) rows
    chunk_start = 0
    for sp_i, mzs, ints in get_spectra(storage, ibd_cobject, imzml_reader,
                                       chunk_sp_inds):
        chunk_end = chunk_start + len(mzs)
        sp_mz_int_buf[chunk_start:chunk_end, 0] = sp_id_to_idx[sp_i]
        sp_mz_int_buf[chunk_start:chunk_end, 1] = mzs
        sp_mz_int_buf[chunk_start:chunk_end, 2] = ints
        chunk_start = chunk_end

    # Sort rows by m/z (column 1) so segmentation can use np.searchsorted later
    by_mz = np.argsort(sp_mz_int_buf[:, 1])
    sp_mz_int_buf = sp_mz_int_buf[by_mz]
    del by_mz

    chunk = serialise(sp_mz_int_buf)
    size_mb = sys.getsizeof(chunk) / 1024 ** 2
    logger.info(f'Uploading spectra chunk {ch_i} - {size_mb:.2f} MB')
    chunk_cobject = storage.put_cloudobject(chunk)
    logger.info(f'Spectra chunk {ch_i} finished')
    return chunk_cobject
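Every example relies on `serialise`/`deserialise` helpers that are never shown. A minimal pickle-based sketch consistent with how they are used (Example #6 below calls `.getbuffer()` on the result, which points to a `BytesIO`); the real helpers may use a different protocol or library:

import pickle
from io import BytesIO

def serialise(obj):
    # Assumption: returns a BytesIO so callers can check .getbuffer().nbytes
    # (Example #6) and hand it directly to storage.put_cloudobject
    buf = BytesIO()
    pickle.dump(obj, buf, protocol=pickle.HIGHEST_PROTOCOL)
    buf.seek(0)
    return buf

def deserialise(data):
    # Accepts raw bytes (storage.get_object) or a readable stream
    # (storage.get_cloudobject(..., stream=True))
    if isinstance(data, (bytes, bytearray)):
        return pickle.loads(data)
    return pickle.load(data)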
Example #2
def _first_level_segment_upload(segm_i):
    # m/z bounds of this segment: lowest left edge and highest right edge
    l = ds_segments_bounds[segm_i][0, 0]
    r = ds_segments_bounds[segm_i][-1, 1]
    segm_start, segm_end = np.searchsorted(
        sp_mz_int_buf[:, 1], (l, r))  # mz expected to be in column 1
    segm = sp_mz_int_buf[segm_start:segm_end]
    return storage.put_cloudobject(serialise(segm))
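A plausible caller for this upload (assumed, not shown in the source): each first-level segment is independent, so the uploads can be fanned out over threads just as Example #3 below parallelises its downloads. `first_level_segms_cobjects` is an assumed name.

from concurrent.futures import ThreadPoolExecutor

# Collect one CloudObject reference per first-level segment
with ThreadPoolExecutor(max_workers=128) as pool:
    first_level_segms_cobjects = list(
        pool.map(_first_level_segment_upload, range(len(ds_segments_bounds))))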
Example #3
def merge_spectra_chunk_segments(segm_cobjects, id, storage):
    print(f'Merging segment {id} spectra chunks')

    def _merge(ch_i):
        segm_spectra_chunk = read_cloud_object_with_retry(
            storage, segm_cobjects[ch_i], deserialise)
        return segm_spectra_chunk

    with ThreadPoolExecutor(max_workers=128) as pool:
        segm = list(pool.map(_merge, range(len(segm_cobjects))))

    segm = np.concatenate(segm)

    # Alternative in-place sorting (slower):
    # segm.view(f'{ds_segm_dtype},{ds_segm_dtype},{ds_segm_dtype}').sort(order=['f1'], axis=0)
    segm = segm[segm[:, 1].argsort()]

    bounds_list = ds_segments_bounds[id]

    # Offset of this segment's sub-segments in the global segment numbering
    # (loop-invariant, so computed once rather than per iteration)
    base_id = sum(len(bounds) for bounds in ds_segments_bounds[:id])

    segms_len = []
    segms_cobjects = []
    for segm_j in range(len(bounds_list)):
        l, r = bounds_list[segm_j]
        segm_start, segm_end = np.searchsorted(
            segm[:, 1], (l, r))  # mz expected to be in column 1
        sub_segm = segm[segm_start:segm_end]
        segms_len.append(len(sub_segm))
        segm_i = base_id + segm_j
        print(f'Storing dataset segment {segm_i}')
        segms_cobjects.append(storage.put_cloudobject(serialise(sub_segm)))

    return segms_len, segms_cobjects
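How this function is invoked is not shown. In Lithops, `id` and `storage` are reserved parameter names injected by the framework (the call's index and a Storage client), so only `segm_cobjects` has to come from the map's input. A hedged driver sketch, assuming `second_level_segms_cobjects` holds one list of chunk CloudObjects per dataset segment:

import lithops

fexec = lithops.FunctionExecutor()
# Wrap each argument list in a tuple so Lithops treats it as one parameter
futures = fexec.map(merge_spectra_chunk_segments,
                    [(cobjs,) for cobjs in second_level_segms_cobjects])
results = fexec.get_result(fs=futures)
segms_len, segms_cobjects = zip(*results)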
Example #4
def save_images(self):
    if self.formula_images:
        print(f'Saving {len(self.formula_images)} images')
        cloud_obj = self._storage.put_cloudobject(
            serialise(self.formula_images))
        self.cloud_objs.append(cloud_obj)
        self._partition += 1
    else:
        print('No images to save')
Example #5
def build_ranking(group_i, ranking_i, database, modifier, adduct, id,
                  storage):
    print('Building ranking...')
    print(f'job_i: {id}')
    print(f'ranking_i: {ranking_i}')
    print(f'database: {database}')
    print(f'modifier: {modifier}')
    print(f'adduct: {adduct}')
    # For every unmodified formula in `database`, look up the MSM score for the molecule
    # that it would become after the modifier and adduct are applied
    mols = read_cloud_object_with_retry(storage, mol_db_path_to_cobj[database],
                                        deserialise)
    if adduct is not None:
        # Target rankings use the same adduct for all molecules
        mol_formulas = list(
            map(safe_generate_ion_formula, mols, repeat(modifier),
                repeat(adduct)))
    else:
        # Decoy rankings use a consistent random adduct for each molecule, chosen
        # so that it doesn't overlap with other decoy rankings for this molecule
        adducts = _get_random_adduct_set(len(mols), decoy_adducts, ranking_i)
        mol_formulas = list(
            map(safe_generate_ion_formula, mols, repeat(modifier), adducts))

    formula_to_id = {}
    for cobject in formula_to_id_cobjects:
        formula_to_id_chunk = read_cloud_object_with_retry(
            storage, cobject, deserialise)

        for formula in mol_formulas:
            formula_id = formula_to_id_chunk.get(formula)
            if formula_id is not None:
                formula_to_id[formula] = formula_id

    formula_is = [formula and formula_to_id.get(formula)
                  for formula in mol_formulas]
    msm = [formula_i and msm_lookup.get(formula_i) for formula_i in formula_is]
    if adduct is not None:
        ranking_df = pd.DataFrame({'mol': mols, 'msm': msm}, index=formula_is)
        ranking_df = ranking_df[~ranking_df.msm.isna()]
    else:
        # Specific molecules don't matter in the decoy rankings,
        # only their msm distribution
        ranking_df = pd.DataFrame({'msm': msm})
        ranking_df = ranking_df[~ranking_df.msm.isna()]

    return id, storage.put_cloudobject(serialise(ranking_df))
Example #6
def _upload(segm_i):
    segm = pd.concat(
        [
            deserialise_from_file(ds_segments_path /
                                  f'ds_segm_{segm_i:04}_{chunk_i:04}')
            for chunk_i in range(chunks_n)
        ],
        ignore_index=True,
        sort=False,
    )
    segm.sort_values('mz', inplace=True)
    segm.reset_index(drop=True, inplace=True)
    segm = serialise(segm)
    logger.debug(f'Uploading segment {segm_i}: {segm.getbuffer().nbytes} bytes')
    return storage.put_cloudobject(segm)
Example #7
def clip_centr_df_chunk(peaks_i, peaks_cobject, storage):
    print(f'Clipping centroids dataframe chunk {peaks_i}')
    centroids_df_chunk = deserialise(
        storage.get_cloudobject(peaks_cobject, stream=True)).sort_values('mz')
    centroids_df_chunk = centroids_df_chunk[centroids_df_chunk.mz > 0]

    # Keep every peak of any formula that has at least one peak
    # inside the dataset's m/z range
    ds_mz_range_unique_formulas = centroids_df_chunk[
        (mz_min < centroids_df_chunk.mz)
        & (centroids_df_chunk.mz < mz_max)].index.unique()
    centr_df_chunk = centroids_df_chunk[centroids_df_chunk.index.isin(
        ds_mz_range_unique_formulas)].reset_index()
    clip_centr_chunk_cobject = storage.put_cloudobject(
        serialise(centr_df_chunk))

    return clip_centr_chunk_cobject, centr_df_chunk.shape[0]
Example #8
def calculate_peaks_chunk(segm_i, segm_cobject, storage):
    print(f'Calculating peaks from formulas chunk {segm_i}')
    chunk_df = deserialise(
        storage.get_cloudobject(segm_cobject, stream=True))
    peaks = [
        peak for formula_i, formula in chunk_df.items()
        for peak in calculate_peaks_for_formula(formula_i, formula)
    ]
    peaks_df = pd.DataFrame(peaks,
                            columns=['formula_i', 'peak_i', 'mz', 'int'])
    peaks_df.set_index('formula_i', inplace=True)

    print(f'Storing centroids chunk {segm_i}')
    peaks_cobject = storage.put_cloudobject(serialise(peaks_df))

    return peaks_cobject, peaks_df.shape[0]
Example #9
def store_formula_to_id_chunk(ch_i, input_cobjects, storage):
    print(f'Storing formula_to_id dictionary chunk {ch_i}')

    def _get(cobj):
        # Invert the formula chunk: map formula string -> formula index
        formula_chunk = read_cloud_object_with_retry(storage, cobj,
                                                     deserialise)
        formula_to_id_chunk = dict(
            zip(formula_chunk.values, formula_chunk.index))
        return formula_to_id_chunk

    formula_to_id = {}
    with ThreadPoolExecutor(max_workers=128) as pool:
        for chunk_dict in pool.map(_get, input_cobjects):
            formula_to_id.update(chunk_dict)

    return storage.put_cloudobject(serialise(formula_to_id))
Example #10
def _store(segm_i):
    segm_id = chunk_i * n_threads + segm_i
    print(f'Storing formulas segment {segm_id}')
    return storage.put_cloudobject(serialise(segm_list[segm_i]))
Example #11
def save(self, data, key):
    self.storage.put_object(self.bucket, self.resolve_key(key),
                            serialise(data))
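The corresponding read path isn't shown; a minimal sketch assuming the matching Lithops `Storage.get_object` call and the same key scheme:

def load(self, key):
    # Assumed counterpart to save(): fetch the raw bytes and deserialise them
    data = self.storage.get_object(self.bucket, self.resolve_key(key))
    return deserialise(data)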
Example #12
def _upload(path):
    mol_sfs = sorted(set(pd.read_csv(path).sf))
    return storage.put_cloudobject(serialise(mol_sfs))
Example #13
def _second_level_upload(df):
    return storage.put_cloudobject(serialise(df))
Example #14
def _first_level_upload(args):
    segm_i, df = args
    del df['segm_i']
    return segm_i, storage.put_cloudobject(serialise(df))
Example #15
def _store(chunk_i):
    return chunk_i, storage.put_cloudobject(
        serialise(formulas_chunks[chunk_i]))
Example #16
def get_portable_imzml_reader(storage):
    imzml_stream = storage.get_cloudobject(imzml_cobject, stream=True)
    parser = ImzMLParser(imzml_stream, ibd_file=None)
    imzml_reader = parser.portable_spectrum_reader()
    imzml_reader_cobject = storage.put_cloudobject(serialise(imzml_reader))
    return imzml_reader, imzml_reader_cobject
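Example #16 returns both the reader and a CloudObject reference to it. Example #1's comment explains the split: the reference is small enough to pass into later Lithops tasks, while the reader itself is not. A plausible invocation (executor setup assumed), running the extraction as a single serverless call:

import lithops

fexec = lithops.FunctionExecutor()
future = fexec.call_async(get_portable_imzml_reader, ())
imzml_reader, imzml_reader_cobject = fexec.get_result(fs=future)
# Later tasks such as upload_chunk (Example #1) capture only
# imzml_reader_cobject and deserialise the reader inside each worker.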
Example #17
def _store_db_data(db_data):
    return storage.put_cloudobject(serialise(db_data))
Example #18
def _store(segm):
    return storage.put_cloudobject(serialise(segm))