Example #1
def executor(sm_config):
    from sm.engine.annotation_lithops.executor import Executor

    executor = Executor(sm_config['lithops'], debug_run_locally=True)

    yield executor

    executor.clean()
    for bucket, prefix in sm_config['lithops']['sm_storage'].values():
        keys = executor.storage.list_keys(bucket, prefix)
        if keys:
            executor.storage.delete_objects(bucket, keys)
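A minimal sketch of how a test might consume this fixture, assuming it lives in a conftest.py under the name executor; the double helper, the argument list, and the expected result ordering are illustrative assumptions, not taken from the source:

# Hypothetical test using the fixture above. Functions passed to Executor.map in the
# examples below take their positional arguments from each args tuple plus a
# keyword-only `storage` handle, and this sketch follows the same convention.
def test_map_doubles_values(executor):
    def double(x, *, storage):
        return x * 2

    results = executor.map(double, [(i,) for i in range(4)], runtime_memory=512)
    assert list(results) == [0, 2, 4, 6]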
Example #2
def run(sm_config, ds_id_str, sql_where, algorithm, use_lithops):
    db = DB()

    if sql_where:
        ds_ids = [
            id for (id, ) in db.select(
                f'SELECT DISTINCT dataset.id FROM dataset WHERE {sql_where}')
        ]
    else:
        ds_ids = ds_id_str.split(',')

    if not ds_ids:
        logger.warning('No datasets match filter')
        return

    if use_lithops:
        executor = Executor(sm_config['lithops'])

    for i, ds_id in enumerate(ds_ids):
        try:
            logger.info(
                f'[{i+1} / {len(ds_ids)}] Generating ion thumbnail for {ds_id}'
            )
            ds = Dataset.load(db, ds_id)
            if use_lithops:
                # noinspection PyUnboundLocalVariable
                generate_ion_thumbnail_lithops(executor,
                                               db,
                                               ds,
                                               algorithm=algorithm)
            else:
                generate_ion_thumbnail(db, ds, algorithm=algorithm)
        except Exception:
            logger.error(f'Failed on {ds_id}', exc_info=True)
Example #3
def define_centr_segments(
    fexec: Executor,
    clip_centr_chunks_cobjs: List[CloudObject],
    centr_n: int,
    ds_size_mb: int,
):
    logger.info('Defining centroids segments bounds')

    def get_first_peak_mz(idx, cobject, *, storage):
        print(
            f'Extracting first peak mz values from clipped centroids dataframe {idx}'
        )
        centr_df = load_cobj(storage, cobject)
        first_peak_df = centr_df[centr_df.peak_i == 0]
        return first_peak_df.mz.values

    first_peak_df_mz = np.concatenate(
        fexec.map(get_first_peak_mz,
                  list(enumerate(clip_centr_chunks_cobjs)),
                  runtime_memory=512))

    data_per_centr_segm_mb = 50
    peaks_per_centr_segm = 10000
    centr_segm_n = int(
        max(ds_size_mb // data_per_centr_segm_mb,
            centr_n // peaks_per_centr_segm, MIN_CENTR_SEGMS))

    segm_bounds_q = [i * 1 / centr_segm_n for i in range(0, centr_segm_n)]
    centr_segm_lower_bounds = np.quantile(first_peak_df_mz, segm_bounds_q)

    logger.info(
        f'Generated {len(centr_segm_lower_bounds)} centroids bounds: '
        f'{centr_segm_lower_bounds[0]}...{centr_segm_lower_bounds[-1]}')
    return centr_segm_lower_bounds
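As a worked illustration of the quantile step above (with hypothetical m/z values, not from the source): for centr_segm_n = 4 the quantile positions are [0.0, 0.25, 0.5, 0.75], and np.quantile turns them into one lower m/z bound per segment.

import numpy as np

# Hypothetical first-peak m/z values; 4 segments -> quantiles [0.0, 0.25, 0.5, 0.75]
first_peak_mz = np.array([100.0, 150.0, 200.0, 250.0, 300.0, 350.0, 400.0, 450.0])
centr_segm_n = 4
segm_bounds_q = [i / centr_segm_n for i in range(centr_segm_n)]
lower_bounds = np.quantile(first_peak_mz, segm_bounds_q)
print(lower_bounds)  # [100.  187.5 275.  362.5] - each value starts one centroid segment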
Example #4
def load_ds(
    executor: Executor, imzml_cobject: CloudObject, ibd_cobject: CloudObject, ds_segm_size_mb: int
) -> Tuple[LithopsImzMLReader, np.ndarray, List[CObj[pd.DataFrame]], np.ndarray,]:
    try:
        ibd_head = executor.storage.head_object(ibd_cobject.bucket, ibd_cobject.key)
        ibd_size_mb = int(ibd_head['content-length']) / 1024 // 1024
    except Exception:
        logger.warning("Couldn't read ibd size", exc_info=True)
        ibd_size_mb = 1024

    # Guess the amount of memory needed. For the majority of datasets (no zero-intensity peaks,
    # separate m/z arrays per spectrum) approximately 3x the ibd file size is used during the
    # most memory-intense part (sorting the m/z array).
    if ibd_size_mb * 3 + 512 < 32 * 1024:
        logger.info(f'Found {ibd_size_mb}MB .ibd file. Trying serverless load_ds')
        runtime_memory = max(2048, int(2 ** np.ceil(np.log2(ibd_size_mb * 3 + 512))))
    else:
        logger.info(f'Found {ibd_size_mb}MB .ibd file. Using VM-based load_ds')
        runtime_memory = 128 * 1024

    imzml_reader, ds_segments_bounds, ds_segms_cobjs, ds_segm_lens = executor.call(
        _load_ds,
        (imzml_cobject, ibd_cobject, ds_segm_size_mb),
        runtime_memory=runtime_memory,
    )

    logger.info(f'Segmented dataset chunks into {len(ds_segms_cobjs)} segments')

    return imzml_reader, ds_segments_bounds, ds_segms_cobjs, ds_segm_lens
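A quick sketch of the memory heuristic above, using a hypothetical 2000 MB .ibd file: 3 × 2000 + 512 = 6512 MB is below the 32 GB serverless cut-off, and rounding up to the next power of two selects an 8192 MB runtime.

import numpy as np

# Hypothetical ibd size, used only to show the sizing rule from load_ds above
ibd_size_mb = 2000
estimated_mb = ibd_size_mb * 3 + 512  # ~3x the ibd size is needed for the m/z sort
runtime_memory = max(2048, int(2 ** np.ceil(np.log2(estimated_mb))))
print(estimated_mb, runtime_memory)  # 6512 8192 -> serverless branch, since 6512 < 32 * 1024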
Example #5
def reprocess_dataset_local(sm_src,
                            src_ds_id,
                            dst_ds_id,
                            update_metadata_func,
                            skip_existing=True,
                            use_cache=False):
    existing = get_dataset_diagnostics(dst_ds_id)
    if skip_existing and existing:
        print(f'Skipping {dst_ds_id}\n', end=None)
        return dst_ds_id, None

    smds = sm_src.dataset(id=src_ds_id)
    db = DB()
    ds_metadata, ds_config = update_metadata_func(smds.metadata, smds.config)

    ds = Dataset(
        id=dst_ds_id,
        name=smds.name,
        input_path=smds.s3dir,
        upload_dt=datetime.now(),
        metadata=ds_metadata,
        config=ds_config,
        status=DatasetStatus.QUEUED,
        status_update_dt=None,
        is_public=False,
    )
    ds.save(db, None, True)
    with perf_profile(db, 'annotate_lithops', dst_ds_id) as perf:
        executor = Executor(SMConfig.get_conf()['lithops'], perf=perf)
        job = ServerAnnotationJob(executor, ds, perf, use_cache=use_cache)
        job.pipe.use_db_cache = False
        job.run()
    return dst_ds_id
Example #6
def clip_centr_df(fexec: Executor, peaks_cobjs: List[CloudObject],
                  mz_min: float,
                  mz_max: float) -> Tuple[List[CObj[pd.DataFrame]], int]:
    def clip_centr_df_chunk(peaks_i, peaks_cobject, storage):
        print(f'Clipping centroids dataframe chunk {peaks_i}')
        centroids_df_chunk = load_cobj(storage,
                                       peaks_cobject).sort_values('mz')
        centroids_df_chunk = centroids_df_chunk[centroids_df_chunk.mz > 0]

        ds_mz_range_unique_formulas = centroids_df_chunk[
            (mz_min < centroids_df_chunk.mz)
            & (centroids_df_chunk.mz < mz_max)].index.unique()
        centr_df_chunk = centroids_df_chunk[centroids_df_chunk.index.isin(
            ds_mz_range_unique_formulas)].reset_index()
        clip_centr_chunk_cobject = save_cobj(storage, centr_df_chunk)

        return clip_centr_chunk_cobject, centr_df_chunk.shape[0]

    assert len(peaks_cobjs) > 0
    clip_centr_chunks_cobjs, centr_n = fexec.map_unpack(
        clip_centr_df_chunk,
        list(enumerate(peaks_cobjs)),
        runtime_memory=512,
    )

    clip_centr_chunks_cobjs = list(clip_centr_chunks_cobjs)
    centr_n = sum(centr_n)
    logger.info(f'Prepared {centr_n} centroids')
    return clip_centr_chunks_cobjs, centr_n
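For readers unfamiliar with map_unpack: judging purely from its use here (an inference, not documented behaviour), it looks like a plain map whose per-task result tuples are then transposed into parallel sequences, which zip(*...) reproduces.

# Hypothetical per-task results of the shape clip_centr_df_chunk returns:
# (cloud-object handle, number of rows kept in that chunk).
results = [('cobj-0', 120), ('cobj-1', 95), ('cobj-2', 210)]
cobjs, counts = zip(*results)  # what map_unpack appears to do for you
print(list(cobjs))  # ['cobj-0', 'cobj-1', 'cobj-2']
print(sum(counts))  # 425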
Example #7
    def __init__(
        self,
        imzml_cobject: CloudObject,
        ibd_cobject: CloudObject,
        moldbs: List[InputMolDb],
        ds_config: DSConfig,
        executor: Executor = None,
        lithops_config=None,
        cache_key=None,
        use_db_cache=True,
        use_db_mutex=True,
    ):
        lithops_config = lithops_config or SMConfig.get_conf()['lithops']
        self.lithops_config = lithops_config
        self._db = DB()
        self.imzml_cobject = imzml_cobject
        self.ibd_cobject = ibd_cobject
        self.moldbs = moldbs
        self.ds_config = ds_config
        self.isocalc_wrapper = IsocalcWrapper(ds_config)

        self.executor = executor or Executor(lithops_config)
        self.storage = self.executor.storage

        if cache_key is not None:
            self.cacher: Optional[PipelineCacher] = PipelineCacher(
                self.storage, cache_key, lithops_config)
        else:
            self.cacher = None

        self.use_db_cache = use_db_cache
        self.use_db_mutex = use_db_mutex
        self.ds_segm_size_mb = 128
Example #8
def store_images_to_s3(
    executor: Executor,
    ds_id: str,
    formula_i_to_db_id: pd.Series,
    png_cobjs: List[CObj[List[Tuple[int, bytes]]]],
) -> DbFormulaImagesDict:
    """
    Upload PNG isotopic images to S3 image storage. Images may be uploaded multiple times if a
    formula_i is in multiple databases (i.e. there are duplicates in the formula_i_to_db_id index).
    This is intentional, as there's no check for reused images when deleting individual dataset jobs
    e.g. by removing a moldb without reprocessing. It's easier to just avoid ever reusing images.
    """
    sm_config = SMConfig.get_conf()

    def _upload_png_batch(png_cobj: CObj[List[Tuple[int, bytes]]], *,
                          storage: Storage, perf: SubtaskProfiler):
        def _upload_images(pngs):
            return [
                image_storage.post_image(image_storage.ISO, ds_id, png)
                if png is not None else None for png in pngs
            ]

        formula_png_chunk = load_cobj(storage, png_cobj)
        image_storage = ImageStorage(sm_config)
        n_images = 0

        tasks = (
            pd.DataFrame(formula_png_chunk, columns=['formula_i', 'pngs'])
            .set_index('formula_i')
            .join(formula_i_to_db_id, how='inner')
        )
        # Limit parallelism to 6 to avoid accidentally hitting S3's upload limit (3500 PUTs/s)
        # The default parallelism (8 threads, because Cloud Functions get 4 CPUs) is slightly
        # too high for datasets with a lot of images.
        with ThreadPoolExecutor(6) as executor:
            db_formula_image_ids: DbFormulaImagesDict = defaultdict(dict)

            for db_id, formula_id, image_ids in zip(
                    tasks.moldb_id, tasks.index,
                    executor.map(_upload_images, tasks.pngs)):
                db_formula_image_ids[db_id][formula_id] = image_ids
                n_images += len([i for i in image_ids if i is not None])

        perf.add_extra_data(n_tasks=len(tasks), n_images=n_images)

        return db_formula_image_ids

    results = executor.map(_upload_png_batch, [(cobj, ) for cobj in png_cobjs],
                           runtime_memory=512)
    db_formula_image_ids: DbFormulaImagesDict = defaultdict(dict)
    for result in results:
        for db_id, db_result in result.items():
            db_formula_image_ids[db_id].update(db_result)

    return db_formula_image_ids
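The ThreadPoolExecutor(6) above caps concurrent uploads so the aggregate PUT rate stays under the object store's limit. A standalone sketch of that bounded-parallelism pattern, with a hypothetical fake_upload stand-in for the real image upload:

from concurrent.futures import ThreadPoolExecutor

def fake_upload(payload):
    # Stand-in for the real image upload call; returns a fake image id
    return f'image-id-for-{payload}'

payloads = [f'png-{i}' for i in range(20)]
# At most 6 uploads run at once, regardless of how many payloads are queued
with ThreadPoolExecutor(max_workers=6) as pool:
    image_ids = list(pool.map(fake_upload, payloads))
print(len(image_ids))  # 20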
Example #9
def filter_results_and_make_pngs(
    fexec: Executor,
    formula_metrics_df: pd.DataFrame,
    moldbs: List[InputMolDb],
    fdrs: Dict[int, pd.DataFrame],
    images_df: pd.DataFrame,
    imzml_reader: LithopsImzMLReader,
):
    results_dfs = {}
    all_formula_is = set()
    for moldb_id, fdr in fdrs.items():
        result_df = (
            # Drop any columns already in fdr, as the FDR results may add or overwrite columns
            # with values from the scoring function.
            formula_metrics_df.drop(columns=fdr.columns, errors='ignore').join(
                fdr, how='inner').sort_values('fdr'))
        # Filter out zero-MSM annotations again to ensure that untargeted databases don't get
        # zero-MSM annotations, even if they have some overlap with targeted databases.
        is_targeted = any(db['targeted'] for db in moldbs
                          if db['id'] == moldb_id)
        if not is_targeted:
            result_df = result_df[(result_df.msm > 0) & (result_df.fdr < 1)]
        results_dfs[moldb_id] = result_df
        all_formula_is.update(results_dfs[moldb_id].index)

    image_tasks_df = images_df[images_df.index.isin(all_formula_is)].copy()
    jobs = _split_png_jobs(image_tasks_df, imzml_reader.w, imzml_reader.h)
    png_generator = PngGenerator(imzml_reader.mask)

    def save_png_chunk(df: pd.DataFrame, *, storage: Storage):
        pngs = []
        groups = defaultdict(lambda: [])
        for formula_i, cobj in df.cobj.items():
            groups[cobj].append(formula_i)

        image_dict_iter = iter_cobjs_with_prefetch(storage,
                                                   list(groups.keys()))
        for image_dict, formula_is in zip(image_dict_iter, groups.values()):
            for formula_i in formula_is:
                formula_pngs = [
                    png_generator.generate_png(img.toarray())
                    if img is not None else None
                    for img in image_dict[formula_i]
                ]
                pngs.append((formula_i, formula_pngs))
        return save_cobj(storage, pngs)

    png_cobjs = fexec.map(save_png_chunk,
                          jobs,
                          include_modules=['png'],
                          runtime_memory=1024)

    return results_dfs, png_cobjs
Example #10
def run_fdr(
    executor: Executor,
    formula_scores_df: pd.DataFrame,
    db_data_cobjs: List[CObj[DbFDRData]],
    ds_config: DSConfig,
) -> Dict[int, pd.DataFrame]:
    def _run_fdr_for_db(db_data_cobject: CObj[DbFDRData], *, storage: Storage):
        print(f'Loading FDR data from {db_data_cobject}')
        db_data = load_cobj(storage, db_data_cobject)
        moldb_id = db_data['id']
        fdr = db_data['fdr']
        formula_map_df = db_data['formula_map_df']

        formula_msm = formula_map_df.merge(formula_scores_df,
                                           how='inner',
                                           left_on='formula_i',
                                           right_index=True)
        modifiers = fdr.target_modifiers_df[[
            'chem_mod', 'neutral_loss', 'adduct'
        ]]
        results_df = (
            fdr.estimate_fdr(formula_msm, scoring_model)
            .assign(moldb_id=moldb_id)
            .set_index('formula_i')
            .merge(modifiers, left_on='modifier', right_index=True, how='outer')
        )

        return db_data['id'], results_df

    logger.info('Estimating FDRs...')
    scoring_model = load_scoring_model(ds_config['fdr'].get('scoring_model'))

    args = [(db_data_cobj, ) for db_data_cobj in db_data_cobjs]
    results = executor.map(_run_fdr_for_db, args, runtime_memory=2048)

    for moldb_id, moldb_fdrs in results:
        logger.info(f'DB {moldb_id} number of annotations with FDR less than:')
        for fdr_step in [0.05, 0.1, 0.2, 0.5]:
            logger.info(
                f'{fdr_step * 100:2.0f}%: {(moldb_fdrs.fdr <= fdr_step).sum()}'
            )

    return dict(results)
Example #11
    def annotate_lithops(self, ds: Dataset, del_first=False):
        if del_first:
            self.logger.warning(f'Deleting all results for dataset: {ds.id}')
            del_jobs(ds)
        ds.save(self._db, self._es)
        with perf_profile(self._db, 'annotate_lithops', ds.id) as perf:
            executor = Executor(self._sm_config['lithops'], perf=perf)

            ServerAnnotationJob(executor, ds, perf).run()

            if self._sm_config['services'].get('colocalization', True):
                Colocalization(self._db).run_coloc_job_lithops(executor, ds, reprocess=del_first)

            if self._sm_config['services'].get('ion_thumbnail', True):
                generate_ion_thumbnail_lithops(
                    executor=executor,
                    db=self._db,
                    ds=ds,
                    only_if_needed=not del_first,
                )
Example #12
def build_moldb(
    executor: Executor,
    ds_config: DSConfig,
    moldbs: List[InputMolDb],
) -> Tuple[List[CObj[pd.DataFrame]], List[CObj[DbFDRData]]]:
    def _build_moldb(
        *, storage: Storage, perf: Profiler
    ) -> Tuple[List[CObj[pd.DataFrame]], List[CObj[DbFDRData]]]:
        logger.info('Generating formulas...')
        db_data_cobjs, formulas_df = get_formulas_df(storage, ds_config,
                                                     moldbs)
        num_formulas = len(formulas_df)
        perf.record_entry('generated formulas', num_formulas=num_formulas)

        logger.info('Storing formulas...')
        formula_cobjs = store_formula_segments(storage, formulas_df)
        perf.record_entry('stored formulas', num_chunks=len(formula_cobjs))

        return formula_cobjs, db_data_cobjs

    return executor.call(_build_moldb, (), runtime_memory=4096)
Example #13
def generate_ion_thumbnail_lithops(
    executor: Executor,
    db,
    ds: Dataset,
    only_if_needed=False,
    algorithm=DEFAULT_ALGORITHM,
):
    try:
        (existing_thumb_id,) = db.select_one(THUMB_SEL, [ds.id])

        if existing_thumb_id and only_if_needed:
            return

        annotation_rows = db.select(ISO_IMAGE_SEL, [ds.id])

        if not annotation_rows:
            logger.warning('Could not create ion thumbnail - no annotations found')
            return

        ds_id = ds.id
        sm_config = SMConfig.get_conf()

        def generate(annotation_rows):
            return _generate_ion_thumbnail_image(
                image_storage.ImageStorage(sm_config), ds_id, annotation_rows, algorithm
            )

        thumbnail = executor.call(
            generate, (annotation_rows,), runtime_memory=2048, include_modules=['png']
        )

        image_id = _save_ion_thumbnail_image(ds.id, thumbnail)
        image_url = image_storage.get_image_url(image_storage.THUMB, ds.id, image_id)
        db.alter(THUMB_UPD, [image_id, image_url, ds.id])

        if existing_thumb_id:
            image_storage.delete_image(image_storage.THUMB, ds.id, existing_thumb_id)

    except Exception:
        logger.error('Error generating ion thumbnail image', exc_info=True)
Example #14
    def run_coloc_job_lithops(self, fexec: Executor, ds: Dataset, reprocess: bool = False):
        # Extract required fields to avoid pickling Dataset, because unpickling Dataset tries to
        # import psycopg2 and fails inside Functions
        ds_id = ds.id
        sm_config = self._sm_config

        def run_coloc_job(moldb_id, image_ids, ion_ids, fdrs, *, storage):
            # Use web_app_url to get the publicly-exposed storage server address, because
            # Functions can't use the private address
            images, h, w = _get_images(ImageStorage(sm_config), ds_id, image_ids)
            cobjs = []
            for job in analyze_colocalization(ds_id, moldb_id, images, ion_ids, fdrs, h, w):
                cobjs.append(save_cobj(storage, job))
            return cobjs

        tasks = list(self._iter_pending_coloc_tasks(ds.id, reprocess))
        cost_factors = pd.DataFrame({'n_images': [len(task[1]) for task in tasks]})
        job_cobjs = fexec.map_concat(
            run_coloc_job, tasks, cost_factors=cost_factors, runtime_memory=4096
        )

        for job in iter_cobjs_with_prefetch(fexec.storage, job_cobjs):
            self._save_job_to_db(job)
Example #15
    def run_search(self, store_images=False, use_lithops=False):
        if not store_images:
            self._patch_image_storage()

        moldb_id = molecular_db.find_by_name_version(self.moldb['name'],
                                                     self.moldb['version']).id

        os.environ['PYSPARK_PYTHON'] = sys.executable

        ds = create_ds_from_files(self.ds_id, self.ds_name, self.input_path)
        ds.config['analysis_version'] = self.analysis_version
        ds.config['fdr']['scoring_model'] = (
            'v3_default' if self.analysis_version > 1 else None
        )
        ds.config['database_ids'] = [moldb_id]

        self.db.alter('DELETE FROM job WHERE ds_id=%s', params=(ds.id, ))
        ds.save(self.db, allow_insert=True)
        perf = NullProfiler()
        if use_lithops:
            # Override the runtime to force it to run without docker.
            lithops_executor.RUNTIME_CF_VPC = 'python'
            lithops_executor.RUNTIME_CE = 'python'

            executor = Executor(self.sm_config['lithops'], perf)
            job = ServerAnnotationJob(
                executor,
                ds,
                perf,
                self.sm_config,
                store_images=store_images,
            )
            job.run(debug_validate=True)
        else:
            AnnotationJob(ds, perf).run()

        self.make_comparison_df()
Example #16
def calculate_centroids(
    fexec: Executor, formula_cobjs: List[CObj[pd.DataFrame]], isocalc_wrapper: IsocalcWrapper
) -> List[CObj[pd.DataFrame]]:
    def calculate_peaks_for_formula(args):
        formula_i, formula, target, targeted = args
        mzs, ints = isocalc_wrapper.centroids(formula)
        if mzs is not None:
            return [
                (formula_i, peak_i, mzs[peak_i], ints[peak_i], target, targeted)
                for peak_i in range(len(mzs))
            ]
        return []

    def calculate_peaks_chunk(segm_i: int, segm_cobject: CObj[pd.DataFrame], *, storage: Storage):
        print(f'Calculating peaks from formulas chunk {segm_i}')
        chunk_df = load_cobj(storage, segm_cobject)
        chunk_iter = chunk_df[['ion_formula', 'target', 'targeted']].itertuples(True, None)
        peaks = list(chain(*map(calculate_peaks_for_formula, chunk_iter)))
        peaks_df = pd.DataFrame(
            peaks, columns=['formula_i', 'peak_i', 'mz', 'int', 'target', 'targeted']
        )
        peaks_df = peaks_df.astype(
            {
                'formula_i': 'u4',
                'peak_i': 'u1',
                'mz': 'f8',
                'int': 'f4',
                'target': '?',
                'targeted': '?',
            }
        )

        peaks_df.set_index('formula_i', inplace=True)

        print(f'Storing centroids chunk {segm_i}')
        peaks_cobject = save_cobj(storage, peaks_df)

        return peaks_cobject, peaks_df.shape[0]

    peaks_cobjs, peaks_cobject_lens = fexec.map_unpack(
        calculate_peaks_chunk,
        list(enumerate(formula_cobjs)),
        runtime_memory=2048,
    )

    num_centroids = sum(peaks_cobject_lens)
    logger.info(f'Calculated {num_centroids} centroids in {len(peaks_cobjs)} chunks')

    def _sort_peaks_cobjs(*, storage):
        df = pd.concat(load_cobjs(storage, peaks_cobjs))
        first_peak_mz = df.mz[df.peak_i == 0].sort_values()

        peaks_chunk_size = 64 * 2 ** 20
        n_chunks = int(np.ceil(df.memory_usage().sum() / peaks_chunk_size))
        cnt = len(first_peak_mz)
        chunks = (
            df.loc[first_peak_mz.index[cnt * i // n_chunks : cnt * (i + 1) // n_chunks]]
            for i in range(n_chunks)
        )

        return save_cobjs(storage, chunks)

    sorted_peaks_cobjs = fexec.call(
        _sort_peaks_cobjs,
        (),
        cost_factors={'num_centroids': num_centroids, 'num_peak_cobjects': len(peaks_cobjs)},
        runtime_memory=256 + 100 * num_centroids / 2 ** 20,
    )

    logger.info(f'Sorted centroids chunks into {len(sorted_peaks_cobjs)} chunks')
    return sorted_peaks_cobjs
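A standalone illustration (hypothetical numbers) of the slicing rule used in _sort_peaks_cobjs above: cnt * i // n_chunks produces contiguous, near-equal chunks without dropping or duplicating rows, even when cnt is not divisible by n_chunks.

cnt, n_chunks = 10, 3
bounds = [(cnt * i // n_chunks, cnt * (i + 1) // n_chunks) for i in range(n_chunks)]
print(bounds)  # [(0, 3), (3, 6), (6, 10)] - chunk sizes 3, 3 and 4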
Example #17
def segment_centroids(
    fexec: Executor,
    peaks_cobjs: List[CObj[pd.DataFrame]],
    ds_segms_cobjs: List[CObj[pd.DataFrame]],
    ds_segms_bounds: np.ndarray,
    ds_segm_size_mb: int,
    is_intensive_dataset: bool,
    isocalc_wrapper: IsocalcWrapper,
) -> List[CObj[pd.DataFrame]]:
    # pylint: disable=too-many-locals
    mz_min, mz_max = ds_segms_bounds[0, 0], ds_segms_bounds[-1, 1]

    clip_centr_chunks_cobjs, centr_n = clip_centr_df(fexec, peaks_cobjs,
                                                     mz_min, mz_max)

    # define first level segmentation and then segment each one into desired number

    centr_segm_lower_bounds = define_centr_segments(
        fexec,
        clip_centr_chunks_cobjs,
        centr_n,
        len(ds_segms_cobjs) * ds_segm_size_mb,
    )
    first_level_centr_segm_n = min(32, len(centr_segm_lower_bounds))
    centr_segm_lower_bounds = np.array_split(centr_segm_lower_bounds,
                                             first_level_centr_segm_n)
    first_level_centr_segm_bounds = np.array(
        [bounds[0] for bounds in centr_segm_lower_bounds])

    def segment_centr_df(centr_df, db_segm_lower_bounds):
        first_peak_df = centr_df[centr_df.peak_i == 0].copy()
        segment_mapping = (np.searchsorted(
            db_segm_lower_bounds, first_peak_df.mz.values, side='right') - 1)
        first_peak_df['segm_i'] = segment_mapping
        centr_segm_df = pd.merge(centr_df,
                                 first_peak_df[['formula_i', 'segm_i']],
                                 on='formula_i').sort_values('mz')
        return centr_segm_df

    def segment_centr_chunk(idx, cobject, *, storage):
        print(f'Segmenting clipped centroids dataframe chunk {idx}')
        centr_df = load_cobj(storage, cobject)
        centr_segm_df = segment_centr_df(centr_df,
                                         first_level_centr_segm_bounds)

        def _first_level_upload(args):
            segm_i, df = args
            del df['segm_i']
            return segm_i, save_cobj(storage, df)

        with ThreadPoolExecutor(max_workers=128) as pool:
            sub_segms = list(centr_segm_df.groupby('segm_i'))
            sub_segms_cobjs = list(pool.map(_first_level_upload, sub_segms))

        return dict(sub_segms_cobjs)

    first_level_segms_cobjs = fexec.map(
        segment_centr_chunk,
        list(enumerate(clip_centr_chunks_cobjs)),
        runtime_memory=1024)

    def merge_centr_df_segments(segm_i, segm_cobjects, *, storage):
        print(f'Merging segment {segm_i} clipped centroids chunks')
        # Temporarily index by formula_i for faster filtering when saving
        segm = pd.concat(load_cobjs(storage,
                                    segm_cobjects)).set_index('formula_i')
        formula_segms_df = choose_ds_segments_per_formula(
            ds_segms_bounds, segm, isocalc_wrapper)

        # Try to balance formulas so that they all span roughly the same number of DS segments,
        # and have roughly the same number of formulas.
        max_segm_span = max((formula_segms_df.hi - formula_segms_df.lo).max(), 3)
        if is_intensive_dataset:
            max_segm_count = int(round(np.clip(centr_n / 1000, 1000, 5000)))
        else:
            max_segm_count = int(round(np.clip(centr_n / 1000, 1000, 15000)))
        formula_i_groups = []
        segm_lo_idx = 0
        while segm_lo_idx < len(formula_segms_df):
            max_segm_hi = formula_segms_df.lo[segm_lo_idx] + max_segm_span + 1
            max_span_idx = np.searchsorted(formula_segms_df.hi, max_segm_hi,
                                           'left')
            segm_hi_idx = min(segm_lo_idx + max_segm_count, max_span_idx,
                              len(formula_segms_df))
            formula_i_groups.append(
                formula_segms_df.formula_i.values[segm_lo_idx:segm_hi_idx])
            print(segm_lo_idx, segm_hi_idx)
            segm_lo_idx = segm_hi_idx

        def _second_level_upload(formula_is):
            return save_cobj(
                storage, segm.loc[formula_is].sort_values('mz').reset_index())

        print(f'Storing {len(formula_i_groups)} centroids segments')
        with ThreadPoolExecutor(max_workers=4) as pool:
            segms_cobjects = list(
                pool.map(_second_level_upload, formula_i_groups))

        return segms_cobjects

    second_level_segms_dict = defaultdict(list)
    for sub_segms_cobjs in first_level_segms_cobjs:
        for first_level_segm_i in sub_segms_cobjs:
            second_level_segms_dict[first_level_segm_i].append(
                sub_segms_cobjs[first_level_segm_i])
    second_level_segms_cobjs = sorted(second_level_segms_dict.items(),
                                      key=lambda x: x[0])

    first_level_cobjs = [
        co for cos in first_level_segms_cobjs for co in cos.values()
    ]

    db_segms_cobjs = fexec.map_concat(merge_centr_df_segments,
                                      second_level_segms_cobjs,
                                      runtime_memory=512)

    fexec.storage.delete_cloudobjects(first_level_cobjs)

    return db_segms_cobjs
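A standalone illustration (hypothetical values) of the segment-assignment step in segment_centr_df above: searchsorted against the segment lower bounds maps each first-peak m/z to the index of the segment that contains it.

import numpy as np

lower_bounds = np.array([100.0, 200.0, 300.0])  # one lower m/z bound per segment
mzs = np.array([105.0, 199.9, 200.0, 250.0, 350.0])
segm_i = np.searchsorted(lower_bounds, mzs, side='right') - 1
print(segm_i)  # [0 0 1 1 2]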
Example #18
def validate_centroids(fexec: Executor, peaks_cobjs: List[CObj[pd.DataFrame]]):
    # Ignore code duplicated with validate_centroid_segments as the duplicated parts of the code
    # are too entangled with non-duplicated parts of the code

    def warn(message, df=None):
        warnings.append(message)
        logger.warning(message)
        if df is not None:
            logger.warning(df)

    def get_segm_stats(segm_cobject: CObj[pd.DataFrame], *, storage: Storage):
        segm = load_cobj(storage, segm_cobject)
        n_peaks = segm.groupby(level='formula_i').peak_i.count()
        formula_is = segm.index.unique()
        stats = pd.Series(
            {
                'min_mz': segm.mz.min(),
                'max_mz': segm.mz.max(),
                'min_formula_i': segm.index.min(),
                'max_formula_i': segm.index.max(),
                'avg_n_peaks': n_peaks.mean(),
                'min_n_peaks': n_peaks.min(),
                'max_n_peaks': n_peaks.max(),
                'max_int': segm.int.max(),
                'missing_peaks': (
                    segm.loc[n_peaks.index[n_peaks != 4]]
                    .groupby(level='formula_i')
                    .peak_i.apply(lambda peak_is: len(set(range(len(peak_is))) - set(peak_is)))
                    .sum()
                ),
                'n_formulas': len(formula_is),
                'n_peaks': len(segm),
            }
        )
        return formula_is, stats

    warnings: List[str] = []
    results = fexec.map(get_segm_stats, [(co,) for co in peaks_cobjs], runtime_memory=1024)
    segm_formula_is = [formula_is for formula_is, stats in results]
    stats_df = pd.DataFrame([stats for formula_is, stats in results])

    with pd.option_context(
        'display.max_rows', None, 'display.max_columns', None, 'display.width', 1000
    ):
        # Report cases with fewer peaks than expected (indication that formulas are being
        # split between multiple segments)
        wrong_n_peaks = stats_df[
            (stats_df.avg_n_peaks < 3.9) | (stats_df.min_n_peaks < 2) | (stats_df.max_n_peaks > 4)
        ]
        if not wrong_n_peaks.empty:
            warn(
                'segment_centroids produced segments with unexpected peaks-per-formula '
                '(should be almost always 4, occasionally 2 or 3):',
                wrong_n_peaks,
            )

        # Report missing peaks
        missing_peaks = stats_df[stats_df.missing_peaks > 0]
        if not missing_peaks.empty:
            warn('segment_centroids produced segments with missing peaks:', missing_peaks)

        formula_in_segms_df = validate_formulas_not_in_multiple_segms(segm_formula_is, warn)

        logger.debug(
            f'Found {stats_df.n_peaks.sum()} peaks for {stats_df.n_formulas.sum()} formulas '
            f'across {len(peaks_cobjs)} segms'
        )
        n_per_segm = formula_in_segms_df.groupby('segm_i').formula_i.count()
        logger.debug(f'Segm sizes range from {n_per_segm.min()} to {n_per_segm.max()}')

        if warnings:
            try:
                __import__('__main__').stats_df = stats_df
                print('validate_centroids debug info written to "stats_df" variable')
            except Exception:
                pass

            raise AssertionError('Some checks failed in validate_centroids')
Example #19
def run_coloc_jobs(
    sm_config, ds_id_str, sql_where, fix_missing, fix_corrupt, skip_existing, use_lithops
):
    assert (
        len(
            [
                data_source
                for data_source in [ds_id_str, sql_where, fix_missing, fix_corrupt]
                if data_source
            ]
        )
        == 1
    ), "Exactly one data source (ds_id, sql_where, fix_missing, fix_corrupt) must be specified"
    assert not (ds_id_str and sql_where)

    db = DB()

    if ds_id_str:
        ds_ids = ds_id_str.split(',')
    elif sql_where:
        ds_ids = [
            id for (id,) in db.select(f'SELECT DISTINCT dataset.id FROM dataset WHERE {sql_where}')
        ]
    else:
        mol_dbs = [
            (doc['id'], doc['name'])
            for doc in db.select_with_fields('SELECT id, name FROM molecular_db m')
        ]
        mol_db_ids, mol_db_names = map(list, zip(*mol_dbs))
        fdrs = [0.05, 0.1, 0.2, 0.5]
        algorithms = ['median_thresholded_cosine', 'cosine']

        if fix_missing:
            logger.info('Checking for missing colocalization jobs...')
            results = db.select(
                MISSING_COLOC_JOBS_SEL, [list(mol_db_ids), list(mol_db_names), fdrs, algorithms]
            )
            ds_ids = [ds_id for ds_id, in results]
            logger.info(f'Found {len(ds_ids)} missing colocalization sets')
        else:
            logger.info(
                'Checking all colocalization jobs. '
                'This is super slow: ~5 minutes per 1000 datasets...'
            )
            results = db.select(
                CORRUPT_COLOC_JOBS_SEL, [list(mol_db_ids), list(mol_db_names), fdrs, algorithms]
            )
            ds_ids = [ds_id for ds_id, in results]
            logger.info(f'Found {len(ds_ids)} corrupt colocalization sets')

    if not ds_ids:
        logger.warning('No datasets match filter')
        return

    if use_lithops:
        executor = Executor(sm_config['lithops'])

    for i, ds_id in enumerate(ds_ids):
        try:
            logger.info(f'Running colocalization on {i+1} out of {len(ds_ids)}')
            ds = Dataset.load(db, ds_id)
            coloc = Colocalization(db)
            if use_lithops:
                # noinspection PyUnboundLocalVariable
                coloc.run_coloc_job_lithops(executor, ds, reprocess=not skip_existing)
            else:
                coloc.run_coloc_job(ds, reprocess=not skip_existing)
        except Exception:
            logger.error(f'Failed to run colocalization on {ds_id}', exc_info=True)
Example #20
def test_server_annotation_job(test_db, executor: Executor, sm_config,
                               ds_config, metadata):
    db = DB()
    moldb_id = import_test_molecular_db()
    ds_config['database_ids'] = [moldb_id]
    ds_config['isotope_generation']['adducts'] = ['[M]+']  # test spectrum was made with no adduct
    # ds_config['isotope_generation']['n_peaks'] = 2  # minimize overlap between decoys and targets
    ds_config['image_generation']['ppm'] = 0.001  # minimize overlap between decoys and targets
    ds_config['fdr']['decoy_sample_size'] = len(MOCK_DECOY_ADDUCTS)
    input_path = upload_test_imzml(executor.storage, sm_config, ds_config)

    ds = Dataset(
        id=datetime.now().strftime('%Y-%m-%d_%Hh%Mm%Ss'),
        name='Test Lithops Dataset',
        input_path=input_path,
        upload_dt=datetime.now(),
        metadata=metadata,
        config=ds_config,
        is_public=True,
        status=DatasetStatus.QUEUED,
    )
    ds.save(db, None, allow_insert=True)

    with perf_profile(db, 'test_lithops_annotate', ds.id) as perf:
        # Overwrite executor's NullProfiler with a real profiler
        executor._perf = perf
        job = ServerAnnotationJob(executor=executor, ds=ds, perf=perf)
        job.run(debug_validate=True)

    def db_df(sql, args):
        return pd.DataFrame(db.select_with_fields(sql, args))

    jobs = db_df('SELECT * FROM job WHERE ds_id = %s', (ds.id, ))
    anns = db_df(
        'SELECT * FROM annotation WHERE job_id = ANY(%s) ORDER BY msm DESC',
        (jobs.id.tolist(), ))
    diags = db_df('SELECT * FROM dataset_diagnostic WHERE ds_id = %s',
                  (ds.id, ))
    profiles = db_df('SELECT * FROM perf_profile WHERE ds_id = %s', (ds.id, ))
    profile_entries = db_df(
        'SELECT * FROM perf_profile_entry WHERE profile_id = ANY(%s)',
        (profiles.id.tolist(), ))
    # For debugging annotations / FDR-related issues
    debug_data = job.pipe.debug_get_annotation_data(MOCK_FORMULAS[0], '')
    # print(debug_data)
    # db_data = load_cobjs(executor.storage, job.pipe.db_data_cobjs)[0]
    # print(db_data)
    # print(load_cobjs(executor.storage, job.pipe.ds_segms_cobjs))
    # moldb = pd.concat(load_cobjs(executor.storage, job.pipe.db_segms_cobjs))
    # formula_mzs = moldb.groupby('formula_i').mz.apply(list)
    # all_metrics = (
    #     job.pipe.formula_metrics_df.join(db_data['formula_map_df'].set_index('formula_i'))
    #     .join(formula_mzs)
    #     .sort_values('msm', ascending=False)
    # )
    # print(all_metrics)
    # print(job.pipe.ds_segments_bounds)
    # print(job.pipe.ds_segm_lens)
    # print(job.pipe.fdrs)

    # print(pd.DataFrame(anns))
    # print(pd.DataFrame(diags))
    # print(pd.DataFrame(profiles))
    # print(pd.DataFrame(profile_entries))

    # Validate jobs
    assert len(jobs) == 1
    assert jobs.moldb_id[0] == moldb_id

    # Validate annotations
    assert np.array_equal(anns.formula,
                          MOCK_FORMULAS)  # Formulas should be MSM-descending
    assert np.array_equal(anns.fdr, [0.05] * 2 + [0.5] * 8)

    # Validate images were saved
    image_ids = [imgs[0] for imgs in anns.iso_image_ids]
    images = image_storage.get_ion_images_for_analysis(ds.id, image_ids)[0]
    assert images.shape == (len(anns), 4 * 4)
    # All non-masked pixels should have a value
    assert np.count_nonzero(images) == len(anns) * len(MOCK_COORDS)

    # Validate diagnostics
    metadata_diag = diags[diags.type == DiagnosticType.IMZML_METADATA].iloc[0]
    tic_diag = diags[diags.type == DiagnosticType.TIC].iloc[0]

    assert metadata_diag.error is None
    assert metadata_diag.data['n_spectra'] == len(MOCK_COORDS)
    assert metadata_diag.images[0]['key'] == DiagnosticImageKey.MASK
    mask_image = load_npy_image(ds.id, metadata_diag.images[0]['image_id'])
    assert np.count_nonzero(mask_image) == len(MOCK_COORDS)

    assert tic_diag.error is None
    assert tic_diag.data['min_tic'] > 0
    assert tic_diag.images[0]['key'] == DiagnosticImageKey.TIC
    tic_image = load_npy_image(ds.id, tic_diag.images[0]['image_id'])
    assert tic_image.dtype == np.float32
    assert tic_image.shape == (4, 4)
    assert np.array_equal(np.isnan(tic_image),
                          ~mask_image)  # Masked area should be NaNs
    assert (tic_image[mask_image] >
            0).all()  # Non-masked area should be non-zero

    # Validate perf profile
    assert len(profiles) == 1
    assert len(profile_entries) > 10