Code example #1
def define_centr_segments(
    fexec: Executor,
    clip_centr_chunks_cobjs: List[CloudObject],
    centr_n: int,
    ds_size_mb: int,
):
    logger.info('Defining centroids segments bounds')

    def get_first_peak_mz(idx, cobject, *, storage):
        print(
            f'Extracting first peak mz values from clipped centroids dataframe {idx}'
        )
        centr_df = load_cobj(storage, cobject)
        first_peak_df = centr_df[centr_df.peak_i == 0]
        return first_peak_df.mz.values

    first_peak_df_mz = np.concatenate(
        fexec.map(get_first_peak_mz,
                  list(enumerate(clip_centr_chunks_cobjs)),
                  runtime_memory=512))

    data_per_centr_segm_mb = 50
    peaks_per_centr_segm = 10000
    centr_segm_n = int(
        max(ds_size_mb // data_per_centr_segm_mb,
            centr_n // peaks_per_centr_segm, MIN_CENTR_SEGMS))

    segm_bounds_q = [i * 1 / centr_segm_n for i in range(0, centr_segm_n)]
    centr_segm_lower_bounds = np.quantile(first_peak_df_mz, segm_bounds_q)

    logger.info(
        f'Generated {len(centr_segm_lower_bounds)} centroids bounds: '
        f'{centr_segm_lower_bounds[0]}...{centr_segm_lower_bounds[-1]}')
    return centr_segm_lower_bounds
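
define_centr_segments returns quantile-based lower bounds so that downstream segments receive roughly equal numbers of first-peak m/z values. Below is a minimal, self-contained NumPy sketch of that idea with illustrative values only (no Lithops, no project code):

import numpy as np

# Hypothetical stand-in for first_peak_df_mz: unsorted m/z values.
mz_values = np.random.default_rng(0).uniform(100, 1000, size=100_000)

centr_segm_n = 8
segm_bounds_q = [i / centr_segm_n for i in range(centr_segm_n)]
lower_bounds = np.quantile(mz_values, segm_bounds_q)

# Assigning each value to the segment whose lower bound precedes it
# (the same searchsorted pattern segment_centroids uses later) yields
# roughly equal-sized segments.
segm_i = np.searchsorted(lower_bounds, mz_values, side='right') - 1
print(np.bincount(segm_i))  # ~12500 values in each of the 8 segments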
Code example #2
def store_images_to_s3(
    executor: Executor,
    ds_id: str,
    formula_i_to_db_id: pd.Series,
    png_cobjs: List[CObj[List[Tuple[int, bytes]]]],
) -> DbFormulaImagesDict:
    """
    Upload PNG isotopic images to S3 image storage. Images may be uploaded multiple times if a
    formula_i is in multiple databases (i.e. there are duplicates in the formula_i_to_db_id index).
    This is intentional, as there's no check for reused images when deleting individual dataset jobs
    e.g. by removing a moldb without reprocessing. It's easier to just avoid ever reusing images.
    """
    sm_config = SMConfig.get_conf()

    def _upload_png_batch(png_cobj: CObj[List[Tuple[int, bytes]]], *,
                          storage: Storage, perf: SubtaskProfiler):
        def _upload_images(pngs):
            return [
                image_storage.post_image(image_storage.ISO, ds_id, png)
                if png is not None else None for png in pngs
            ]

        formula_png_chunk = load_cobj(storage, png_cobj)
        image_storage = ImageStorage(sm_config)
        n_images = 0

        tasks = (
            pd.DataFrame(formula_png_chunk, columns=['formula_i', 'pngs'])
            .set_index('formula_i')
            .join(formula_i_to_db_id, how='inner')
        )
        # Limit parallelism to 6 to avoid accidentally hitting S3's upload limit (3500 PUTs/s)
        # The default parallelism (8 threads, because Cloud Functions get 4 CPUs) is slightly
        # too high for datasets with a lot of images.
        with ThreadPoolExecutor(6) as pool:
            db_formula_image_ids: DbFormulaImagesDict = defaultdict(dict)

            for db_id, formula_id, image_ids in zip(
                    tasks.moldb_id, tasks.index,
                    pool.map(_upload_images, tasks.pngs)):
                db_formula_image_ids[db_id][formula_id] = image_ids
                n_images += len([i for i in image_ids if i is not None])

        perf.add_extra_data(n_tasks=len(tasks), n_images=n_images)

        return db_formula_image_ids

    results = executor.map(_upload_png_batch, [(cobj, ) for cobj in png_cobjs],
                           runtime_memory=512)
    db_formula_image_ids: DbFormulaImagesDict = defaultdict(dict)
    for result in results:
        for db_id, db_result in result.items():
            db_formula_image_ids[db_id].update(db_result)

    return db_formula_image_ids
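
The tail of store_images_to_s3 folds the per-batch dictionaries into one nested mapping keyed by database id and then formula id. A tiny self-contained sketch of that merge pattern, using made-up ids and image names:

from collections import defaultdict
from typing import Dict, List, Optional

# Each batch result maps moldb_id -> formula_i -> list of image ids (None for missing images).
batch_results = [
    {1: {10: ['img-a', None]}, 2: {10: ['img-b']}},
    {1: {11: ['img-c']}},
]

merged: Dict[int, Dict[int, List[Optional[str]]]] = defaultdict(dict)
for result in batch_results:
    for db_id, db_result in result.items():
        merged[db_id].update(db_result)

print(dict(merged))
# {1: {10: ['img-a', None], 11: ['img-c']}, 2: {10: ['img-b']}}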
Code example #3
def filter_results_and_make_pngs(
    fexec: Executor,
    formula_metrics_df: pd.DataFrame,
    moldbs: List[InputMolDb],
    fdrs: Dict[int, pd.DataFrame],
    images_df: pd.DataFrame,
    imzml_reader: LithopsImzMLReader,
):
    results_dfs = {}
    all_formula_is = set()
    for moldb_id, fdr in fdrs.items():
        result_df = (
            # Drop any columns already in fdr, as the FDR results may add or overwrite columns
            # with values from the scoring function.
            formula_metrics_df.drop(columns=fdr.columns, errors='ignore')
            .join(fdr, how='inner')
            .sort_values('fdr')
        )
        # Filter out zero-MSM annotations again to ensure that untargeted databases don't get
        # zero-MSM annotations, even if they have some overlap with targeted databases.
        is_targeted = any(db['targeted'] for db in moldbs
                          if db['id'] == moldb_id)
        if not is_targeted:
            result_df = result_df[(result_df.msm > 0) & (result_df.fdr < 1)]
        results_dfs[moldb_id] = result_df
        all_formula_is.update(results_dfs[moldb_id].index)

    image_tasks_df = images_df[images_df.index.isin(all_formula_is)].copy()
    jobs = _split_png_jobs(image_tasks_df, imzml_reader.w, imzml_reader.h)
    png_generator = PngGenerator(imzml_reader.mask)

    def save_png_chunk(df: pd.DataFrame, *, storage: Storage):
        pngs = []
        groups = defaultdict(lambda: [])
        for formula_i, cobj in df.cobj.items():
            groups[cobj].append(formula_i)

        image_dict_iter = iter_cobjs_with_prefetch(storage,
                                                   list(groups.keys()))
        for image_dict, formula_is in zip(image_dict_iter, groups.values()):
            for formula_i in formula_is:
                formula_pngs = [
                    png_generator.generate_png(img.toarray())
                    if img is not None else None
                    for img in image_dict[formula_i]
                ]
                pngs.append((formula_i, formula_pngs))
        return save_cobj(storage, pngs)

    png_cobjs = fexec.map(save_png_chunk,
                          jobs,
                          include_modules=['png'],
                          runtime_memory=1024)

    return results_dfs, png_cobjs
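
The per-database filtering above keeps only positive-MSM, sub-1.0-FDR rows for untargeted databases. A minimal pandas sketch of that step, with made-up metric values:

import pandas as pd

result_df = pd.DataFrame(
    {'msm': [0.9, 0.0, 0.5], 'fdr': [0.05, 1.0, 0.2]},
    index=pd.Index([101, 102, 103], name='formula_i'),
)
is_targeted = False  # targeted databases skip this filter
if not is_targeted:
    result_df = result_df[(result_df.msm > 0) & (result_df.fdr < 1)]
print(result_df.index.tolist())  # [101, 103]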
Code example #4
File: run_fdr.py  Project: metaspace2020/metaspace
def run_fdr(
    executor: Executor,
    formula_scores_df: pd.DataFrame,
    db_data_cobjs: List[CObj[DbFDRData]],
    ds_config: DSConfig,
) -> Dict[int, pd.DataFrame]:
    def _run_fdr_for_db(db_data_cobject: CObj[DbFDRData], *, storage: Storage):
        print(f'Loading FDR data from {db_data_cobject}')
        db_data = load_cobj(storage, db_data_cobject)
        moldb_id = db_data['id']
        fdr = db_data['fdr']
        formula_map_df = db_data['formula_map_df']

        formula_msm = formula_map_df.merge(formula_scores_df,
                                           how='inner',
                                           left_on='formula_i',
                                           right_index=True)
        modifiers = fdr.target_modifiers_df[[
            'chem_mod', 'neutral_loss', 'adduct'
        ]]
        results_df = (
            fdr.estimate_fdr(formula_msm, scoring_model)
            .assign(moldb_id=moldb_id)
            .set_index('formula_i')
            .merge(modifiers, left_on='modifier', right_index=True, how='outer')
        )

        return db_data['id'], results_df

    logger.info('Estimating FDRs...')
    scoring_model = load_scoring_model(ds_config['fdr'].get('scoring_model'))

    args = [(db_data_cobj, ) for db_data_cobj in db_data_cobjs]
    results = executor.map(_run_fdr_for_db, args, runtime_memory=2048)

    for moldb_id, moldb_fdrs in results:
        logger.info(f'DB {moldb_id} number of annotations with FDR less than:')
        for fdr_step in [0.05, 0.1, 0.2, 0.5]:
            logger.info(
                f'{fdr_step * 100:2.0f}%: {(moldb_fdrs.fdr <= fdr_step).sum()}'
            )

    return dict(results)
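
The logging loop at the end of run_fdr counts how many annotations fall at or below each FDR threshold. The same counting in isolation, with dummy FDR values:

import pandas as pd

moldb_fdrs = pd.DataFrame({'fdr': [0.01, 0.04, 0.15, 0.3, 0.6]})
for fdr_step in [0.05, 0.1, 0.2, 0.5]:
    print(f'{fdr_step * 100:2.0f}%: {(moldb_fdrs.fdr <= fdr_step).sum()}')
#  5%: 2
# 10%: 2
# 20%: 3
# 50%: 4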
Code example #5
def segment_centroids(
    fexec: Executor,
    peaks_cobjs: List[CObj[pd.DataFrame]],
    ds_segms_cobjs: List[CObj[pd.DataFrame]],
    ds_segms_bounds: np.ndarray,
    ds_segm_size_mb: int,
    is_intensive_dataset: bool,
    isocalc_wrapper: IsocalcWrapper,
) -> List[CObj[pd.DataFrame]]:
    # pylint: disable=too-many-locals
    mz_min, mz_max = ds_segms_bounds[0, 0], ds_segms_bounds[-1, 1]

    clip_centr_chunks_cobjs, centr_n = clip_centr_df(fexec, peaks_cobjs,
                                                     mz_min, mz_max)

    # Define the first-level segmentation, then split each first-level segment into the
    # desired number of final segments

    centr_segm_lower_bounds = define_centr_segments(
        fexec,
        clip_centr_chunks_cobjs,
        centr_n,
        len(ds_segms_cobjs) * ds_segm_size_mb,
    )
    first_level_centr_segm_n = min(32, len(centr_segm_lower_bounds))
    centr_segm_lower_bounds = np.array_split(centr_segm_lower_bounds,
                                             first_level_centr_segm_n)
    first_level_centr_segm_bounds = np.array(
        [bounds[0] for bounds in centr_segm_lower_bounds])

    def segment_centr_df(centr_df, db_segm_lower_bounds):
        first_peak_df = centr_df[centr_df.peak_i == 0].copy()
        segment_mapping = (np.searchsorted(
            db_segm_lower_bounds, first_peak_df.mz.values, side='right') - 1)
        first_peak_df['segm_i'] = segment_mapping
        centr_segm_df = pd.merge(centr_df,
                                 first_peak_df[['formula_i', 'segm_i']],
                                 on='formula_i').sort_values('mz')
        return centr_segm_df

    def segment_centr_chunk(idx, cobject, *, storage):
        print(f'Segmenting clipped centroids dataframe chunk {idx}')
        centr_df = load_cobj(storage, cobject)
        centr_segm_df = segment_centr_df(centr_df,
                                         first_level_centr_segm_bounds)

        def _first_level_upload(args):
            segm_i, df = args
            del df['segm_i']
            return segm_i, save_cobj(storage, df)

        with ThreadPoolExecutor(max_workers=128) as pool:
            sub_segms = list(centr_segm_df.groupby('segm_i'))
            sub_segms_cobjs = list(pool.map(_first_level_upload, sub_segms))

        return dict(sub_segms_cobjs)

    first_level_segms_cobjs = fexec.map(
        segment_centr_chunk,
        list(enumerate(clip_centr_chunks_cobjs)),
        runtime_memory=1024)

    def merge_centr_df_segments(segm_i, segm_cobjects, *, storage):
        print(f'Merging segment {segm_i} clipped centroids chunks')
        # Temporarily index by formula_i for faster filtering when saving
        segm = pd.concat(load_cobjs(storage,
                                    segm_cobjects)).set_index('formula_i')
        formula_segms_df = choose_ds_segments_per_formula(
            ds_segms_bounds, segm, isocalc_wrapper)

        # Try to balance formulas so that they all span roughly the same number of DS segments,
        # and have roughly the same number of formulas.
        max_segm_span = max((formula_segms_df.hi - formula_segms_df.lo).max(),
                            3)
        if is_intensive_dataset:
            max_segm_count = int(round(np.clip(centr_n / 1000, 1000, 5000)))
        else:
            max_segm_count = int(round(np.clip(centr_n / 1000, 1000, 15000)))
        formula_i_groups = []
        segm_lo_idx = 0
        while segm_lo_idx < len(formula_segms_df):
            max_segm_hi = formula_segms_df.lo[segm_lo_idx] + max_segm_span + 1
            max_span_idx = np.searchsorted(formula_segms_df.hi, max_segm_hi,
                                           'left')
            segm_hi_idx = min(segm_lo_idx + max_segm_count, max_span_idx,
                              len(formula_segms_df))
            formula_i_groups.append(
                formula_segms_df.formula_i.values[segm_lo_idx:segm_hi_idx])
            print(segm_lo_idx, segm_hi_idx)
            segm_lo_idx = segm_hi_idx

        def _second_level_upload(formula_is):
            return save_cobj(
                storage, segm.loc[formula_is].sort_values('mz').reset_index())

        print(f'Storing {len(formula_i_groups)} centroids segments')
        with ThreadPoolExecutor(max_workers=4) as pool:
            segms_cobjects = list(
                pool.map(_second_level_upload, formula_i_groups))

        return segms_cobjects

    second_level_segms_dict = defaultdict(list)
    for sub_segms_cobjs in first_level_segms_cobjs:
        for first_level_segm_i in sub_segms_cobjs:
            second_level_segms_dict[first_level_segm_i].append(
                sub_segms_cobjs[first_level_segm_i])
    second_level_segms_cobjs = sorted(second_level_segms_dict.items(),
                                      key=lambda x: x[0])

    first_level_cobjs = [
        co for cos in first_level_segms_cobjs for co in cos.values()
    ]

    db_segms_cobjs = fexec.map_concat(merge_centr_df_segments,
                                      second_level_segms_cobjs,
                                      runtime_memory=512)

    fexec.storage.delete_cloudobjects(first_level_cobjs)

    return db_segms_cobjs
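
The first-level bounds used above are simply the first element of each chunk produced by np.array_split over the full set of quantile bounds. A short NumPy sketch of that reduction, with illustrative numbers:

import numpy as np

centr_segm_lower_bounds = np.linspace(100, 1000, 96)  # e.g. 96 fine-grained lower bounds
first_level_centr_segm_n = min(32, len(centr_segm_lower_bounds))
chunks = np.array_split(centr_segm_lower_bounds, first_level_centr_segm_n)
first_level_bounds = np.array([bounds[0] for bounds in chunks])
print(first_level_bounds.shape)  # (32,) - one coarse bound per first-level segment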
Code example #6
def validate_centroids(fexec: Executor, peaks_cobjs: List[CObj[pd.DataFrame]]):
    # Ignore code duplication with validate_centroid_segments, as the duplicated parts are too
    # entangled with the non-duplicated parts to be extracted cleanly

    def warn(message, df=None):
        warnings.append(message)
        logger.warning(message)
        # Explicit None check: a DataFrame has no unambiguous truth value
        if df is not None:
            logger.warning(df)

    def get_segm_stats(segm_cobject: CObj[pd.DataFrame], *, storage: Storage):
        segm = load_cobj(storage, segm_cobject)
        n_peaks = segm.groupby(level='formula_i').peak_i.count()
        formula_is = segm.index.unique()
        stats = pd.Series(
            {
                'min_mz': segm.mz.min(),
                'max_mz': segm.mz.max(),
                'min_formula_i': segm.index.min(),
                'max_formula_i': segm.index.max(),
                'avg_n_peaks': n_peaks.mean(),
                'min_n_peaks': n_peaks.min(),
                'max_n_peaks': n_peaks.max(),
                'max_int': segm.int.max(),
                'missing_peaks': (
                    segm.loc[n_peaks.index[n_peaks != 4]]
                    .groupby(level='formula_i')
                    .peak_i.apply(lambda peak_is: len(set(range(len(peak_is))) - set(peak_is)))
                    .sum()
                ),
                'n_formulas': len(formula_is),
                'n_peaks': len(segm),
            }
        )
        return formula_is, stats

    warnings: List[str] = []
    results = fexec.map(get_segm_stats, [(co,) for co in peaks_cobjs], runtime_memory=1024)
    segm_formula_is = [formula_is for formula_is, stats in results]
    stats_df = pd.DataFrame([stats for formula_is, stats in results])

    with pd.option_context(
        'display.max_rows', None, 'display.max_columns', None, 'display.width', 1000
    ):
        # Report cases with fewer peaks than expected (indication that formulas are being
        # split between multiple segments)
        wrong_n_peaks = stats_df[
            (stats_df.avg_n_peaks < 3.9) | (stats_df.min_n_peaks < 2) | (stats_df.max_n_peaks > 4)
        ]
        if not wrong_n_peaks.empty:
            warn(
                'segment_centroids produced segments with unexpected peaks-per-formula '
                '(should be almost always 4, occasionally 2 or 3):',
                wrong_n_peaks,
            )

        # Report missing peaks
        missing_peaks = stats_df[stats_df.missing_peaks > 0]
        if not missing_peaks.empty:
            warn('segment_centroids produced segments with missing peaks:', missing_peaks)

        formula_in_segms_df = validate_formulas_not_in_multiple_segms(segm_formula_is, warn)

        logger.debug(
            f'Found {stats_df.n_peaks.sum()} peaks for {stats_df.n_formulas.sum()} formulas '
            f'across {len(peaks_cobjs)} segms'
        )
        n_per_segm = formula_in_segms_df.groupby('segm_i').formula_i.count()
        logger.debug(f'Segm sizes range from {n_per_segm.min()} to {n_per_segm.max()}')

        if warnings:
            try:
                __import__('__main__').stats_df = stats_df
                print('validate_centroids debug info written to "stats_df" variable')
            except Exception:
                pass

            raise AssertionError('Some checks failed in validate_centroids')
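
The 'missing_peaks' statistic counts, for formulas with an unexpected number of peaks, how many of the expected peak indices are absent. A toy pandas sketch of the same expression, with made-up data:

import pandas as pd

# Index is formula_i; peak_i records which isotopic peaks are present.
segm = pd.DataFrame(
    {'peak_i': [0, 1, 2, 3, 0, 2]},
    index=pd.Index([7, 7, 7, 7, 8, 8], name='formula_i'),
)
n_peaks = segm.groupby(level='formula_i').peak_i.count()
missing_peaks = (
    segm.loc[n_peaks.index[n_peaks != 4]]
    .groupby(level='formula_i')
    .peak_i.apply(lambda peak_is: len(set(range(len(peak_is))) - set(peak_is)))
    .sum()
)
print(missing_peaks)  # 1: formula 8 has peaks {0, 2} where {0, 1} were expected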