Example 1
def reprocess_dataset_local(sm_src,
                            src_ds_id,
                            dst_ds_id,
                            update_metadata_func,
                            skip_existing=True,
                            use_cache=False):
    existing = get_dataset_diagnostics(dst_ds_id)
    if skip_existing and existing:
        print(f'Skipping {dst_ds_id}\n')  # end=None was a no-op (it is the default), dropped
        return dst_ds_id, None

    smds = sm_src.dataset(id=src_ds_id)
    db = DB()
    ds_metadata, ds_config = update_metadata_func(smds.metadata, smds.config)

    ds = Dataset(
        id=dst_ds_id,
        name=smds.name,
        input_path=smds.s3dir,
        upload_dt=datetime.now(),
        metadata=ds_metadata,
        config=ds_config,
        status=DatasetStatus.QUEUED,
        status_update_dt=None,
        is_public=False,
    )
    ds.save(db, None, allow_insert=True)
    with perf_profile(db, 'annotate_lithops', dst_ds_id) as perf:
        executor = Executor(SMConfig.get_conf()['lithops'], perf=perf)
        job = ServerAnnotationJob(executor, ds, perf, use_cache=use_cache)
        job.pipe.use_db_cache = False
        job.run()
    return dst_ds_id, None  # match the tuple shape of the skip-existing branch above
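For context, a hypothetical call sketch (sm_src, both dataset IDs, the database ID, and the callback are placeholders, not part of the source):

def use_other_database(metadata, config):
    # Placeholder callback: keep metadata as-is, swap in a different database list.
    return metadata, {**config, 'database_ids': [22]}

new_ds_id, _ = reprocess_dataset_local(
    sm_src=sm_src,  # an SMInstance-like client for the source server (assumed)
    src_ds_id='2000-01-01_00h00m00s',
    dst_ds_id='2000-01-01_00h00m00s_v2',
    update_metadata_func=use_other_database,
)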
Example 2
    def annotate_lithops(self, ds: Dataset, del_first=False):
        if del_first:
            self.logger.warning(f'Deleting all results for dataset: {ds.id}')
            del_jobs(ds)
        ds.save(self._db, self._es)
        with perf_profile(self._db, 'annotate_lithops', ds.id) as perf:
            executor = Executor(self._sm_config['lithops'], perf=perf)

            ServerAnnotationJob(executor, ds, perf).run()

            if self._sm_config['services'].get('colocalization', True):
                Colocalization(self._db).run_coloc_job_lithops(executor, ds, reprocess=del_first)

            if self._sm_config['services'].get('ion_thumbnail', True):
                generate_ion_thumbnail_lithops(
                    executor=executor,
                    db=self._db,
                    ds=ds,
                    only_if_needed=not del_first,
                )
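A hedged sketch of a call site (ds_manager and the dataset ID are assumed names, not from the source):

# Hypothetical: force a clean re-annotation of an already-processed dataset.
ds = Dataset.load(db, '2000-01-01_00h00m00s')
ds_manager.annotate_lithops(ds, del_first=True)  # del_jobs() runs first, then a fresh job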
Example 3
    def add(self, doc, use_lithops, **kwargs):
        """Save dataset and send ANNOTATE message to the queue."""
        now = datetime.now()
        if 'id' not in doc:
            doc['id'] = now.strftime('%Y-%m-%d_%Hh%Mm%Ss')

        ds_config_kwargs = {k: v for k, v in doc.items() if k in FLAT_DS_CONFIG_KEYS}

        try:
            ds = Dataset.load(self._db, doc['id'])
            self._set_ds_busy(ds, kwargs.get('force', False))
            config = update_ds_config(ds.config, doc['metadata'],
                                      **ds_config_kwargs)
        except UnknownDSID:
            config = generate_ds_config(doc.get('metadata'),
                                        **ds_config_kwargs)

        ds = Dataset(
            id=doc['id'],
            name=doc.get('name'),
            input_path=doc.get('input_path'),
            upload_dt=doc.get('upload_dt', now.isoformat()),
            metadata=doc.get('metadata'),
            config=config,
            is_public=doc.get('is_public'),
            status=DatasetStatus.QUEUED,
        )
        ds.save(self._db, self._es, allow_insert=True)
        self._status_queue.publish({
            'ds_id': ds.id,
            'action': DaemonAction.ANNOTATE,
            'stage': DaemonActionStage.QUEUED
        })

        queue = self._lit_queue if use_lithops else self._annot_queue
        self._post_sm_msg(ds=ds,
                          queue=queue,
                          action=DaemonAction.ANNOTATE,
                          **kwargs)
        return doc['id']
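A minimal sketch of the doc payload this method reads (all values are placeholders, and manager stands in for whatever object exposes add):

doc = {
    'name': 'mouse_brain_test',
    'input_path': 's3a://bucket/path/to/dataset',  # placeholder path
    'metadata': metadata,  # placeholder metadata dict
    'is_public': False,
}
ds_id = manager.add(doc, use_lithops=True)  # 'id' defaults to a timestamp string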
Example 4
def test_dataset_save_overwrite_ds_works(fill_db, sm_config, ds_config):
    db = DB(sm_config['db'])
    es_mock = MagicMock(spec=ESExporter)
    status_queue_mock = MagicMock(spec=QueuePublisher)

    upload_dt = datetime.now()
    ds_id = '2000-01-01'
    ds = Dataset(
        ds_id,
        'ds_name',
        'input_path',
        upload_dt,
        {},  # metadata
        ds_config,
        mol_dbs=['HMDB'],
        adducts=['+H'],
    )

    ds.save(db, es_mock, status_queue_mock)

    assert ds == Dataset.load(db, ds_id)
    es_mock.sync_dataset.assert_called_once_with(ds_id)
    status_queue_mock.publish.assert_called_with({
        'ds_id': ds_id,
        'status': DatasetStatus.NEW
    })
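Note that MagicMock(spec=ESExporter) restricts the mock to ESExporter's real attribute set, so a mistyped method call in the code under test raises AttributeError instead of silently recording a call; the same applies to the QueuePublisher mock.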
Example 5
def create_test_ds(
    id='2000-01-01',
    name='ds_name',
    input_path='input_path',
    upload_dt=None,
    metadata=None,
    config=None,
    status=DatasetStatus.FINISHED,
    es=None,
):
    upload_dt = upload_dt or datetime.now()

    ds = Dataset(
        id=id,
        name=name,
        input_path=input_path,
        upload_dt=upload_dt,  # already defaulted above
        metadata=metadata or deepcopy(TEST_METADATA),
        config=config or deepcopy(TEST_DS_CONFIG),
        status=status or DatasetStatus.QUEUED,
    )
    ds.save(DB(), es=es, allow_insert=True)
    return ds
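A typical test usage, relying on the defaults (the ID and assertion are illustrative, not from the source):

ds = create_test_ds(id='2024-01-01', status=DatasetStatus.QUEUED)
assert Dataset.load(DB(), '2024-01-01').status == DatasetStatus.QUEUED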
Example 6
def test_server_annotation_job(test_db, executor: Executor, sm_config,
                               ds_config, metadata):
    db = DB()
    moldb_id = import_test_molecular_db()
    ds_config['database_ids'] = [moldb_id]
    # test spectrum was made with no adduct
    ds_config['isotope_generation']['adducts'] = ['[M]+']
    # ds_config['isotope_generation']['n_peaks'] = 2  # minimize overlap between decoys and targets
    ds_config['image_generation']['ppm'] = 0.001  # minimize overlap between decoys and targets
    ds_config['fdr']['decoy_sample_size'] = len(MOCK_DECOY_ADDUCTS)
    input_path = upload_test_imzml(executor.storage, sm_config, ds_config)

    ds = Dataset(
        id=datetime.now().strftime('%Y-%m-%d_%Hh%Mm%Ss'),
        name='Test Lithops Dataset',
        input_path=input_path,
        upload_dt=datetime.now(),
        metadata=metadata,
        config=ds_config,
        is_public=True,
        status=DatasetStatus.QUEUED,
    )
    ds.save(db, None, allow_insert=True)

    with perf_profile(db, 'test_lithops_annotate', ds.id) as perf:
        # Overwrite executor's NullProfiler with a real profiler
        executor._perf = perf
        job = ServerAnnotationJob(executor=executor, ds=ds, perf=perf)
        job.run(debug_validate=True)

    def db_df(sql, args):
        return pd.DataFrame(db.select_with_fields(sql, args))

    jobs = db_df('SELECT * FROM job WHERE ds_id = %s', (ds.id,))
    anns = db_df(
        'SELECT * FROM annotation WHERE job_id = ANY(%s) ORDER BY msm DESC',
        (jobs.id.tolist(),),
    )
    diags = db_df('SELECT * FROM dataset_diagnostic WHERE ds_id = %s', (ds.id,))
    profiles = db_df('SELECT * FROM perf_profile WHERE ds_id = %s', (ds.id,))
    profile_entries = db_df(
        'SELECT * FROM perf_profile_entry WHERE profile_id = ANY(%s)',
        (profiles.id.tolist(),),
    )
    # For debugging annotations / FDR-related issues
    debug_data = job.pipe.debug_get_annotation_data(MOCK_FORMULAS[0], '')
    # print(debug_data)
    # db_data = load_cobjs(executor.storage, job.pipe.db_data_cobjs)[0]
    # print(db_data)
    # print(load_cobjs(executor.storage, job.pipe.ds_segms_cobjs))
    # moldb = pd.concat(load_cobjs(executor.storage, job.pipe.db_segms_cobjs))
    # formula_mzs = moldb.groupby('formula_i').mz.apply(list)
    # all_metrics = (
    #     job.pipe.formula_metrics_df.join(db_data['formula_map_df'].set_index('formula_i'))
    #     .join(formula_mzs)
    #     .sort_values('msm', ascending=False)
    # )
    # print(all_metrics)
    # print(job.pipe.ds_segments_bounds)
    # print(job.pipe.ds_segm_lens)
    # print(job.pipe.fdrs)

    # print(pd.DataFrame(anns))
    # print(pd.DataFrame(diags))
    # print(pd.DataFrame(profiles))
    # print(pd.DataFrame(profile_entries))

    # Validate jobs
    assert len(jobs) == 1
    assert jobs.moldb_id[0] == moldb_id

    # Validate annotations
    # Formulas should be in MSM-descending order
    assert np.array_equal(anns.formula, MOCK_FORMULAS)
    assert np.array_equal(anns.fdr, [0.05] * 2 + [0.5] * 8)

    # Validate images were saved
    image_ids = [imgs[0] for imgs in anns.iso_image_ids]
    images = image_storage.get_ion_images_for_analysis(ds.id, image_ids)[0]
    assert images.shape == (len(anns), 4 * 4)
    # All non-masked pixels should have a value
    assert np.count_nonzero(images) == len(anns) * len(MOCK_COORDS)

    # Validate diagnostics
    metadata_diag = diags[diags.type == DiagnosticType.IMZML_METADATA].iloc[0]
    tic_diag = diags[diags.type == DiagnosticType.TIC].iloc[0]

    assert metadata_diag.error is None
    assert metadata_diag.data['n_spectra'] == len(MOCK_COORDS)
    assert metadata_diag.images[0]['key'] == DiagnosticImageKey.MASK
    mask_image = load_npy_image(ds.id, metadata_diag.images[0]['image_id'])
    assert np.count_nonzero(mask_image) == len(MOCK_COORDS)

    assert tic_diag.error is None
    assert tic_diag.data['min_tic'] > 0
    assert tic_diag.images[0]['key'] == DiagnosticImageKey.TIC
    tic_image = load_npy_image(ds.id, tic_diag.images[0]['image_id'])
    assert tic_image.dtype == np.float32
    assert tic_image.shape == (4, 4)
    assert np.array_equal(np.isnan(tic_image), ~mask_image)  # pixels outside the mask should be NaN
    assert (tic_image[mask_image] > 0).all()  # pixels inside the mask should be non-zero

    # Validate perf profile
    assert len(profiles) == 1
    assert len(profile_entries) > 10