def save(self, db_data_cobjs: List[CObj[DbFDRData]],
             peaks_cobjs: List[CObj[pd.DataFrame]]):
        """Copy the given cloud objects into this cache's bucket/prefix and
        record their new locations.

        Returns the (db_data_cobjs, peaks_cobjs) lists pointing at the copies.
        """
        def batch_copy(src_cobjs: List[CloudObject], dest_prefix: str, *,
                       storage: Storage):
            # If Lithops' storage supported Copy Object operations, this could be easily optimized.
            # Not sure if it's worth the effort yet
            result_cobjs = []
            for i, data in enumerate(
                    iter_cobjects_with_prefetch(storage, src_cobjs)):
                # Zero-padded index keeps keys listable in upload order.
                dest_key = f'{dest_prefix}/{i:06}'
                result_cobjs.append(
                    storage.put_cloudobject(data, dest_bucket, dest_key))
            return result_cobjs

        # NOTE: batch_copy closes over dest_bucket; it is assigned here,
        # before executor.map serializes and runs the function.
        dest_bucket = self.bucket
        # Copy cobjs to the cache dir
        new_db_data_cobjs, new_peaks_cobjs = self.executor.map(
            batch_copy,
            [(db_data_cobjs, f'{self.prefix}/db_data'),
             (peaks_cobjs, f'{self.prefix}/peaks')],
            runtime_memory=1024,
        )

        # Save config in case it's needed for debugging
        self.storage.put_cloudobject(json.dumps(self.ds_config, indent=4),
                                     self.bucket, self.config_key)
        # Save list of cobjects. This list would be easy to reconstruct by listing keys, but
        # saving a separate object as the last step of the process is helpful to confirm that
        # the cache item is complete, and didn't partially fail to copy.
        save_cobj(self.storage, (new_db_data_cobjs, new_peaks_cobjs),
                  self.bucket, self.meta_key)

        return new_db_data_cobjs, new_peaks_cobjs
Beispiel #2
0
def test_get_moldb_centroids(LockMock, executor: Executor, sm_config,
                             ds_config):
    """get_moldb_centroids should de-duplicate formula_i values across
    databases and mark a formula as targeted iff it appears in a targeted DB."""
    formula_lists = [['H2O', 'CO2'], ['H2SO4', 'CO2'], ['H2SO4', 'NH4']]
    targeted_flags = [False, True, False]
    moldbs: List[InputMolDb] = [
        {
            'id': db_id,
            'targeted': targeted,
            'cobj': save_cobj(executor.storage, formulas),
        }
        for db_id, (formulas, targeted) in enumerate(
            zip(formula_lists, targeted_flags))
    ]

    db_data_cobjs, peaks_cobjs = get_moldb_centroids(
        executor,
        sm_config['lithops']['sm_storage'],
        ds_config,
        moldbs,
        debug_validate=True,
        use_cache=False,
    )

    db_data = load_cobjs(executor.storage, db_data_cobjs)
    peaks_df = pd.concat(load_cobjs(executor.storage, peaks_cobjs))

    def formula_i_of(db_i, formula):
        # Look up the assigned formula_i for an unmodified formula in one DB
        map_df = db_data[db_i]['formula_map_df'].set_index(
            ['formula', 'modifier'])
        return map_df.loc[(formula, '')].formula_i

    h2o_formula_i0 = formula_i_of(0, 'H2O')
    co2_formula_i0 = formula_i_of(0, 'CO2')
    co2_formula_i1 = formula_i_of(1, 'CO2')
    h2so4_formula_i1 = formula_i_of(1, 'H2SO4')
    h2so4_formula_i2 = formula_i_of(2, 'H2SO4')

    assert co2_formula_i0 == co2_formula_i1, 'formula_i values should be de-duplicated'
    assert h2so4_formula_i1 == h2so4_formula_i2, 'formula_i values should be de-duplicated'
    assert co2_formula_i0 != h2so4_formula_i2, 'formula_i values should not conflict'

    assert not peaks_df.loc[h2o_formula_i0].targeted.any(
    ), "H2O shouldn't be targeted as it's not in a targeted DB"
    assert peaks_df.loc[co2_formula_i0].targeted.any(
    ), "CO2 should be targeted as it's in a targeted DB"
    assert peaks_df.loc[h2so4_formula_i1].targeted.any(
    ), "H2SO4 should be targeted as it's in a targeted DB"
Beispiel #3
0
 def upload_segm(start_end):
     """Build a DataFrame for the [start, end) slice of the spectra arrays
     and persist it as a cloud object."""
     lo, hi = start_end
     segment_df = pd.DataFrame(
         {
             'mz': mzs[lo:hi],
             'int': ints[lo:hi],
             'sp_i': sp_idxs[lo:hi],
         },
         # Keep the global row positions as the index.
         index=pd.RangeIndex(lo, hi),
     )
     return save_cobj(storage, segment_df)
Beispiel #4
0
def _upload_moldbs_from_db(moldb_ids, storage, sm_storage):
    """Ensure each moldb's formula list is present in cloud storage (uploading
    from the database on a cache miss) and return one definition dict per id.

    Each dict has keys 'id', 'cobj' and 'targeted'.
    """
    bucket, prefix = sm_storage['moldb']
    moldb_defs = []
    # Sort the moldbs because the centroids cache key is affected by their order
    for moldb_id in sorted(moldb_ids):
        key = f'{prefix}/{moldb_id}'
        try:
            # Reuse the object if a previous run already uploaded it.
            storage.head_object(bucket, key)
            logger.debug(f'Found mol db at {key}')
            cobject = CloudObject(storage.backend, bucket, key)
        except StorageNoSuchKeyError:
            logger.info(f'Uploading {key}...')
            rows = DB().select(
                'SELECT DISTINCT formula FROM molecule WHERE moldb_id = %s',
                (moldb_id, ))
            formulas = [formula for formula, in rows]
            cobject = save_cobj(storage, formulas, bucket=bucket, key=key)
            logger.info(f'Uploading {key}...Done')
        (targeted, ) = DB().select_one(
            'SELECT targeted FROM molecular_db WHERE id = %s', (moldb_id, ))
        moldb_defs.append({
            'id': moldb_id,
            'cobj': cobject,
            'targeted': targeted,
        })

    return moldb_defs
    def calculate_peaks_chunk(segm_i: int, segm_cobject: CObj[pd.DataFrame], *, storage: Storage):
        """Compute isotopic peaks for one chunk of ion formulas and store them.

        Returns (peaks_cobject, row_count) for the stored centroids chunk.
        """
        print(f'Calculating peaks from formulas chunk {segm_i}')
        formulas_df = load_cobj(storage, segm_cobject)
        formula_rows = formulas_df[['ion_formula', 'target', 'targeted']].itertuples(True, None)
        # Flatten the per-formula peak lists into one table.
        all_peaks = [
            peak
            for row in formula_rows
            for peak in calculate_peaks_for_formula(row)
        ]
        peaks_df = pd.DataFrame(
            all_peaks,
            columns=['formula_i', 'peak_i', 'mz', 'int', 'target', 'targeted'],
        ).astype(
            {
                # Compact dtypes to keep the serialized chunks small.
                'formula_i': 'u4',
                'peak_i': 'u1',
                'mz': 'f8',
                'int': 'f4',
                'target': '?',
                'targeted': '?',
            }
        )
        peaks_df.set_index('formula_i', inplace=True)

        print(f'Storing centroids chunk {segm_i}')
        return save_cobj(storage, peaks_df), len(peaks_df)
 def run_coloc_job(moldb_id, image_ids, ion_ids, fdrs, *, storage):
     """Run colocalization analysis for one moldb and save each job's result
     as a separate cloud object."""
     # Use web_app_url to get the publicly-exposed storage server address, because
     # Functions can't use the private address
     images, h, w = _get_images(ImageStorage(sm_config), ds_id, image_ids)
     return [
         save_cobj(storage, job)
         for job in analyze_colocalization(
             ds_id, moldb_id, images, ion_ids, fdrs, h, w)
     ]
Beispiel #7
0
    def clip_centr_df_chunk(peaks_i, peaks_cobject, storage):
        """Load one centroids chunk, keep only the formulas that have at least
        one peak inside the dataset's m/z range, and store the clipped chunk.

        Returns (clipped_chunk_cobject, row_count).
        """
        print(f'Clipping centroids dataframe chunk {peaks_i}')
        chunk = load_cobj(storage, peaks_cobject).sort_values('mz')
        # Drop rows with non-positive m/z values.
        chunk = chunk[chunk.mz > 0]

        in_ds_range = (mz_min < chunk.mz) & (chunk.mz < mz_max)
        formulas_in_range = chunk[in_ds_range].index.unique()
        # Keep every peak of a formula as long as any of its peaks is in range.
        clipped_chunk = chunk[chunk.index.isin(formulas_in_range)].reset_index()
        clipped_cobject = save_cobj(storage, clipped_chunk)

        return clipped_cobject, clipped_chunk.shape[0]
    def save_png_chunk(df: pd.DataFrame, *, storage: Storage):
        """Render PNGs for each formula's images and save them as one cloud object.

        df's index holds formula_i values and df.cobj holds the cloud object
        containing each formula's image dict. Formulas sharing a cobj are
        grouped so that each cloud object is downloaded only once.
        Returns the cobj of a list of (formula_i, [png-or-None, ...]) pairs.
        """
        pngs = []
        # defaultdict(list) — idiomatic equivalent of defaultdict(lambda: []).
        groups = defaultdict(list)
        for formula_i, cobj in df.cobj.items():
            groups[cobj].append(formula_i)

        image_dict_iter = iter_cobjs_with_prefetch(storage,
                                                   list(groups.keys()))
        # groups.values() iterates in the same order as groups.keys() did.
        for image_dict, formula_is in zip(image_dict_iter, groups.values()):
            for formula_i in formula_is:
                formula_pngs = [
                    png_generator.generate_png(img.toarray())
                    if img is not None else None
                    for img in image_dict[formula_i]
                ]
                pngs.append((formula_i, formula_pngs))
        return save_cobj(storage, pngs)
Beispiel #9
0
def _upload_moldbs_from_files(file_paths, storage, sm_storage):
    """Ensure each moldb file's formula list is present in cloud storage
    (uploading it on a cache miss) and return one definition dict per file.

    Each dict has keys 'id' (the file stem), 'cobj' and 'targeted' (False).
    """
    moldb_defs = []
    for file_path in file_paths:
        bucket, raw_key = _choose_cos_location(file_path, sm_storage, 'moldb')
        key = raw_key + '_formulas'
        try:
            # Reuse the object if a previous run already uploaded it.
            storage.head_object(bucket, key)
            logger.debug(f'Found mol db at {key}')
            cobject = CloudObject(storage.backend, bucket, key)
        except StorageNoSuchKeyError:
            logger.info(f'Uploading {key}...')
            formulas = read_moldb_file(file_path).formula
            cobject = save_cobj(storage, formulas, bucket=bucket, key=key)
            logger.info(f'Uploading {key}...Done')
        moldb_defs.append({
            'id': Path(file_path).stem,
            'cobj': cobject,
            'targeted': False,
        })

    return moldb_defs
Beispiel #10
0
 def _second_level_upload(formula_is):
     """Save the mz-sorted sub-segment covering the given formula indices."""
     sub_segment = segm.loc[formula_is].sort_values('mz').reset_index()
     return save_cobj(storage, sub_segment)
Beispiel #11
0
 def _first_level_upload(args):
     """Persist one first-level segment, stripping its bookkeeping column.

     Returns (segment_index, cobj) for the saved DataFrame.
     """
     chunk_i, chunk_df = args
     # The segment index is returned alongside the cobj, so the column is
     # redundant; drop it in place before saving.
     del chunk_df['segm_i']
     return chunk_i, save_cobj(storage, chunk_df)