def reprocess_dataset_local(
    sm_src, src_ds_id, dst_ds_id, update_metadata_func, skip_existing=True, use_cache=False
):
    """Reprocess an existing dataset under a new dataset ID using the local Lithops pipeline."""
    existing = get_dataset_diagnostics(dst_ds_id)
    if skip_existing and existing:
        print(f'Skipping {dst_ds_id}\n')
        return dst_ds_id, None

    smds = sm_src.dataset(id=src_ds_id)
    db = DB()
    ds_metadata, ds_config = update_metadata_func(smds.metadata, smds.config)
    ds = Dataset(
        id=dst_ds_id,
        name=smds.name,
        input_path=smds.s3dir,
        upload_dt=datetime.now(),
        metadata=ds_metadata,
        config=ds_config,
        status=DatasetStatus.QUEUED,
        status_update_dt=None,
        is_public=False,
    )
    ds.save(db, None, True)  # allow_insert=True

    with perf_profile(db, 'annotate_lithops', dst_ds_id) as perf:
        executor = Executor(SMConfig.get_conf()['lithops'], perf=perf)
        job = ServerAnnotationJob(executor, ds, perf, use_cache=use_cache)
        job.pipe.use_db_cache = False
        job.run()
    # Return the same (ds_id, job) shape as the skip branch above
    return dst_ds_id, job
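# Usage sketch (not from the source): reprocess_dataset_local expects an
# update_metadata_func callback that takes (metadata, config) and returns the
# (possibly modified) pair. The callback and dataset IDs below are hypothetical.
def _passthrough_update_metadata(metadata, config):
    # No-op callback: returns both values unchanged
    return metadata, config

# Assuming `sm_src` is an SMInstance-like client exposing .dataset(id=...):
# dst_id, job = reprocess_dataset_local(
#     sm_src, '2021-01-01_00h00m00s', '2021-01-01_00h00m01s', _passthrough_update_metadata
# )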
def annotate_lithops(self, ds: Dataset, del_first=False):
    """Run the Lithops annotation pipeline, then the optional post-processing services."""
    if del_first:
        self.logger.warning(f'Deleting all results for dataset: {ds.id}')
        del_jobs(ds)
    ds.save(self._db, self._es)
    with perf_profile(self._db, 'annotate_lithops', ds.id) as perf:
        executor = Executor(self._sm_config['lithops'], perf=perf)
        ServerAnnotationJob(executor, ds, perf).run()
        if self._sm_config['services'].get('colocalization', True):
            Colocalization(self._db).run_coloc_job_lithops(executor, ds, reprocess=del_first)
        if self._sm_config['services'].get('ion_thumbnail', True):
            generate_ion_thumbnail_lithops(
                executor=executor,
                db=self._db,
                ds=ds,
                only_if_needed=not del_first,
            )
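# Illustrative sketch (assumed config shape, not from the source): the two
# `services` flags read above default to enabled, so they only need to appear
# in the SM config when disabling a service, e.g.:
# {
#     "services": {
#         "colocalization": false,
#         "ion_thumbnail": false
#     }
# }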
def add(self, doc, use_lithops, **kwargs):
    """Save dataset and send ANNOTATE message to the queue."""
    now = datetime.now()
    if 'id' not in doc:
        doc['id'] = now.strftime('%Y-%m-%d_%Hh%Mm%Ss')

    ds_config_kwargs = {k: v for k, v in doc.items() if k in FLAT_DS_CONFIG_KEYS}

    try:
        # Existing dataset: make sure it isn't mid-processing, then update its config
        ds = Dataset.load(self._db, doc['id'])
        self._set_ds_busy(ds, kwargs.get('force', False))
        config = update_ds_config(ds.config, doc['metadata'], **ds_config_kwargs)
    except UnknownDSID:
        # New dataset: generate a config and create the record
        config = generate_ds_config(doc.get('metadata'), **ds_config_kwargs)
        ds = Dataset(
            id=doc['id'],
            name=doc.get('name'),
            input_path=doc.get('input_path'),
            upload_dt=doc.get('upload_dt', now.isoformat()),
            metadata=doc.get('metadata'),
            config=config,
            is_public=doc.get('is_public'),
            status=DatasetStatus.QUEUED,
        )

    ds.save(self._db, self._es, allow_insert=True)
    self._status_queue.publish(
        {'ds_id': ds.id, 'action': DaemonAction.ANNOTATE, 'stage': DaemonActionStage.QUEUED}
    )
    queue = self._lit_queue if use_lithops else self._annot_queue
    self._post_sm_msg(ds=ds, queue=queue, action=DaemonAction.ANNOTATE, **kwargs)
    return doc['id']
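# Hypothetical example payload for add() (field values are placeholders; the
# metadata structure and the config keys in FLAT_DS_CONFIG_KEYS are defined
# elsewhere in the codebase):
example_doc = {
    'name': 'my_dataset',
    'input_path': 's3a://bucket/path/to/uploaded/imzml',  # assumed S3-style path
    'metadata': {},  # real submissions carry a full metadata document
    'is_public': False,
}
# Without an 'id', add() assigns a timestamp-based one and returns it:
# ds_id = ds_manager.add(example_doc, use_lithops=True)  # `ds_manager` is hypothetical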
def test_dataset_save_overwrite_ds_works(fill_db, sm_config, ds_config):
    db = DB(sm_config['db'])
    es_mock = MagicMock(spec=ESExporter)
    status_queue_mock = MagicMock(spec=QueuePublisher)
    upload_dt = datetime.now()
    ds_id = '2000-01-01'
    ds = Dataset(
        ds_id, 'ds_name', 'input_path', upload_dt, {}, ds_config, mol_dbs=['HMDB'], adducts=['+H']
    )

    ds.save(db, es_mock, status_queue_mock)

    assert ds == Dataset.load(db, ds_id)
    es_mock.sync_dataset.assert_called_once_with(ds_id)
    status_queue_mock.publish.assert_called_with({'ds_id': ds_id, 'status': DatasetStatus.NEW})
def create_test_ds(
    id='2000-01-01',
    name='ds_name',
    input_path='input_path',
    upload_dt=None,
    metadata=None,
    config=None,
    status=DatasetStatus.FINISHED,
    es=None,
):
    upload_dt = upload_dt or datetime.now()

    ds = Dataset(
        id=id,
        name=name,
        input_path=input_path,
        upload_dt=upload_dt,  # already defaulted above
        metadata=metadata or deepcopy(TEST_METADATA),
        config=config or deepcopy(TEST_DS_CONFIG),
        status=status or DatasetStatus.QUEUED,
    )
    ds.save(DB(), es=es, allow_insert=True)
    return ds
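# Usage sketch (illustrative test, not from the source): callers typically
# override only the fields under test and rely on the TEST_METADATA /
# TEST_DS_CONFIG defaults for everything else.
def test_example_with_helper(fill_db):
    ds = create_test_ds(name='helper_ds', status=DatasetStatus.QUEUED)
    assert ds.status == DatasetStatus.QUEUED
    assert ds == Dataset.load(DB(), ds.id)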
def test_server_annotation_job(test_db, executor: Executor, sm_config, ds_config, metadata):
    db = DB()
    moldb_id = import_test_molecular_db()
    ds_config['database_ids'] = [moldb_id]
    # Test spectrum was made with no adduct
    ds_config['isotope_generation']['adducts'] = ['[M]+']
    # ds_config['isotope_generation']['n_peaks'] = 2  # minimize overlap between decoys and targets
    ds_config['image_generation']['ppm'] = 0.001  # minimize overlap between decoys and targets
    ds_config['fdr']['decoy_sample_size'] = len(MOCK_DECOY_ADDUCTS)
    input_path = upload_test_imzml(executor.storage, sm_config, ds_config)
    ds = Dataset(
        id=datetime.now().strftime('%Y-%m-%d_%Hh%Mm%Ss'),
        name='Test Lithops Dataset',
        input_path=input_path,
        upload_dt=datetime.now(),
        metadata=metadata,
        config=ds_config,
        is_public=True,
        status=DatasetStatus.QUEUED,
    )
    ds.save(db, None, allow_insert=True)

    with perf_profile(db, 'test_lithops_annotate', ds.id) as perf:
        # Overwrite executor's NullProfiler with a real profiler
        executor._perf = perf
        job = ServerAnnotationJob(executor=executor, ds=ds, perf=perf)
        job.run(debug_validate=True)

    def db_df(sql, args):
        return pd.DataFrame(db.select_with_fields(sql, args))

    jobs = db_df('SELECT * FROM job WHERE ds_id = %s', (ds.id,))
    anns = db_df(
        'SELECT * FROM annotation WHERE job_id = ANY(%s) ORDER BY msm DESC',
        (jobs.id.tolist(),),
    )
    diags = db_df('SELECT * FROM dataset_diagnostic WHERE ds_id = %s', (ds.id,))
    profiles = db_df('SELECT * FROM perf_profile WHERE ds_id = %s', (ds.id,))
    profile_entries = db_df(
        'SELECT * FROM perf_profile_entry WHERE profile_id = ANY(%s)',
        (profiles.id.tolist(),),
    )

    # For debugging annotations / FDR-related issues
    debug_data = job.pipe.debug_get_annotation_data(MOCK_FORMULAS[0], '')
    # print(debug_data)
    # db_data = load_cobjs(executor.storage, job.pipe.db_data_cobjs)[0]
    # print(db_data)
    # print(load_cobjs(executor.storage, job.pipe.ds_segms_cobjs))
    # moldb = pd.concat(load_cobjs(executor.storage, job.pipe.db_segms_cobjs))
    # formula_mzs = moldb.groupby('formula_i').mz.apply(list)
    # all_metrics = (
    #     job.pipe.formula_metrics_df.join(db_data['formula_map_df'].set_index('formula_i'))
    #     .join(formula_mzs)
    #     .sort_values('msm', ascending=False)
    # )
    # print(all_metrics)
    # print(job.pipe.ds_segments_bounds)
    # print(job.pipe.ds_segm_lens)
    # print(job.pipe.fdrs)
    # print(pd.DataFrame(anns))
    # print(pd.DataFrame(diags))
    # print(pd.DataFrame(profiles))
    # print(pd.DataFrame(profile_entries))

    # Validate jobs
    assert len(jobs) == 1
    assert jobs.moldb_id[0] == moldb_id

    # Validate annotations
    assert np.array_equal(anns.formula, MOCK_FORMULAS)  # Formulas should be MSM-descending
    assert np.array_equal(anns.fdr, [0.05] * 2 + [0.5] * 8)

    # Validate images were saved
    image_ids = [imgs[0] for imgs in anns.iso_image_ids]
    images = image_storage.get_ion_images_for_analysis(ds.id, image_ids)[0]
    assert images.shape == (len(anns), 4 * 4)
    # All non-masked pixels should have a value
    assert np.count_nonzero(images) == len(anns) * len(MOCK_COORDS)

    # Validate diagnostics
    metadata_diag = diags[diags.type == DiagnosticType.IMZML_METADATA].iloc[0]
    tic_diag = diags[diags.type == DiagnosticType.TIC].iloc[0]
    assert metadata_diag.error is None
    assert metadata_diag.data['n_spectra'] == len(MOCK_COORDS)
    assert metadata_diag.images[0]['key'] == DiagnosticImageKey.MASK
    mask_image = load_npy_image(ds.id, metadata_diag.images[0]['image_id'])
    assert np.count_nonzero(mask_image) == len(MOCK_COORDS)
    assert tic_diag.error is None
    assert tic_diag.data['min_tic'] > 0
    assert tic_diag.images[0]['key'] == DiagnosticImageKey.TIC
    tic_image = load_npy_image(ds.id, tic_diag.images[0]['image_id'])
    assert tic_image.dtype == np.float32
    assert tic_image.shape == (4, 4)
    assert np.array_equal(np.isnan(tic_image), ~mask_image)  # Masked area should be NaNs
    assert (tic_image[mask_image] > 0).all()  # Non-masked area should be non-zero

    # Validate perf profile
    assert len(profiles) == 1
    assert len(profile_entries) > 10
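# Note (assumption about the test environment): this test depends on live
# fixtures (test_db, executor, image storage), so it would normally be run
# through pytest rather than imported directly, e.g.:
#   pytest -k test_server_annotation_job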