# Imports assumed by the test functions in this section (module paths are
# inferred from the identifiers used and may need adjusting to the project layout):
from itertools import product
from unittest.mock import MagicMock

import pandas as pd
from pandas.testing import assert_frame_equal

from sm.engine.db import DB
from sm.engine.fdr import FDR


def test_estimate_fdr_digitize_works():
    fdr = FDR(job_id=0, decoy_sample_size=1, target_adducts=['+H'], db=None)
    fdr.fdr_levels = [0.4, 0.8]
    fdr.td_df = pd.DataFrame([['C1', '+H', '+Cu'],
                              ['C2', '+H', '+Ag'],
                              ['C3', '+H', '+Cl'],
                              ['C4', '+H', '+Co']],
                             columns=['sf', 'ta', 'da'])

    msm_df = pd.DataFrame([['C1', '+H', 1.0],
                           ['C2', '+H', 0.75],
                           ['C3', '+H', 0.5],
                           ['C4', '+H', 0.25],
                           ['C1', '+Cu', 0.75],
                           ['C2', '+Ag', 0.3],
                           ['C3', '+Cl', 0.25],
                           ['C4', '+Co', 0.1]],
                          columns=['sf', 'adduct', 'msm']).set_index(['sf', 'adduct']).sort_index()
    exp_sf_df = pd.DataFrame([['C1', '+H', 0.4],
                              ['C2', '+H', 0.4],
                              ['C3', '+H', 0.4],
                              ['C4', '+H', 0.8]],
                             columns=['sf', 'adduct', 'fdr']).set_index(['sf', 'adduct'])

    assert_frame_equal(fdr.estimate_fdr(msm_df), exp_sf_df)

def _run_annotation_job(self, mol_db):
    try:
        self.store_job_meta(mol_db.id)
        mol_db.set_job_id(self._job_id)

        logger.info("Running new job ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                    self._ds.id, self._ds.name, mol_db.name, mol_db.version)

        target_adducts = self._ds.config['isotope_generation']['adducts']
        self._fdr = FDR(job_id=self._job_id,
                        decoy_sample_size=20,
                        target_adducts=target_adducts,
                        db=self._db)

        isocalc = IsocalcWrapper(self._ds.config['isotope_generation'])
        centroids_gen = IonCentroidsGenerator(sc=self._sc, moldb_name=mol_db.name, isocalc=isocalc)
        polarity = self._ds.config['isotope_generation']['charge']['polarity']
        all_adducts = list(set(self._sm_config['defaults']['adducts'][polarity]) | set(DECOY_ADDUCTS))
        centroids_gen.generate_if_not_exist(isocalc=isocalc,
                                            sfs=mol_db.sfs,
                                            adducts=all_adducts)
        target_ions = centroids_gen.ions(target_adducts)
        self._fdr.decoy_adducts_selection(target_ions)

        search_alg = MSMBasicSearch(sc=self._sc, ds=self._ds, ds_reader=self._ds_reader,
                                    mol_db=mol_db, centr_gen=centroids_gen,
                                    fdr=self._fdr, ds_config=self._ds.config)
        ion_metrics_df, ion_iso_images = search_alg.search()

        search_results = SearchResults(mol_db.id, self._job_id, search_alg.metrics.keys())
        mask = self._ds_reader.get_2d_sample_area_mask()
        img_store_type = self._ds.get_ion_img_storage_type(self._db)
        search_results.store(ion_metrics_df, ion_iso_images, mask,
                             self._db, self._img_store, img_store_type)
    except Exception as e:
        self._db.alter(JOB_UPD_STATUS_FINISH,
                       params=(JobStatus.FAILED,
                               datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                               self._job_id))
        msg = 'Job failed(ds_id={}, mol_db={}): {}'.format(self._ds.id, mol_db, str(e))
        raise JobFailedError(msg) from e
    else:
        self._db.alter(JOB_UPD_STATUS_FINISH,
                       params=(JobStatus.FINISHED,
                               datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                               self._job_id))

def test_fdr_decoy_adduct_selection_saves_corr():
    db_mock = MagicMock(DB)
    db_mock.select.return_value = [(1,)]

    fdr = FDR(job_id=0, decoy_sample_size=2, target_adducts=['+H', '+K'], db=db_mock)

    exp_target_decoy_df = pd.DataFrame([('H2O', '+H', '+He'),
                                        ('H2O', '+H', '+Li'),
                                        ('H2O', '+K', '+He'),
                                        ('H2O', '+K', '+Li')],
                                       columns=['sf', 'ta', 'da'])

    fdr.decoy_adducts_selection(target_ions=[('H2O', '+H'), ('H2O', '+K')])

    assert_frame_equal(fdr.td_df.sort_values(by=['sf', 'ta', 'da']).reset_index(drop=True),
                       exp_target_decoy_df.sort_values(by=['sf', 'ta', 'da']).reset_index(drop=True))

def test_fdr_decoy_adduct_selection_saves_corr():
    db_mock = MagicMock(DB)
    db_mock.select.return_value = [(1,)]

    fdr = FDR(0, 0, 2, ['+H', '+K'], db_mock)

    def assert_df_values_equal(self, other):
        assert set(self) == set(other)

    exp_target_decoy_df = pd.DataFrame([(1, '+H', '+He'),
                                        (1, '+H', '+Li'),
                                        (1, '+K', '+He'),
                                        (1, '+K', '+Li')],
                                       columns=['sf_id', 'ta', 'da'])
    fdr._save_target_decoy_df = MagicMock(
        side_effect=lambda: assert_df_values_equal(exp_target_decoy_df, fdr.td_df))

    fdr.decoy_adduct_selection()

def test_estimate_fdr_digitize_works():
    fdr = FDR(0, 0, 1, ['+H'], None)
    fdr.fdr_levels = [0.4, 0.8]
    fdr.td_df = pd.DataFrame([[1, '+H', '+Cu'],
                              [2, '+H', '+Ag'],
                              [3, '+H', '+Cl'],
                              [4, '+H', '+Co']],
                             columns=['sf_id', 'ta', 'da'])

    msm_df = pd.DataFrame([[1, '+H', 1.0],
                           [2, '+H', 0.75],
                           [3, '+H', 0.5],
                           [4, '+H', 0.25],
                           [1, '+Cu', 0.75],
                           [2, '+Ag', 0.3],
                           [3, '+Cl', 0.25],
                           [4, '+Co', 0.1]],
                          columns=['sf_id', 'adduct', 'msm']).set_index(['sf_id', 'adduct']).sort_index()
    exp_sf_df = pd.DataFrame([[1, '+H', 0.4],
                              [2, '+H', 0.4],
                              [3, '+H', 0.4],
                              [4, '+H', 0.8]],
                             columns=['sf_id', 'adduct', 'fdr']).set_index(['sf_id', 'adduct'])

    assert_frame_equal(fdr.estimate_fdr(msm_df), exp_sf_df)

def test_estimate_fdr_returns_correct_df():
    fdr = FDR(0, 0, 2, ['+H'], None)
    fdr.fdr_levels = [0.2, 0.8]
    fdr.td_df = pd.DataFrame([[1, '+H', '+Cu'],
                              [1, '+H', '+Co'],
                              [2, '+H', '+Ag'],
                              [2, '+H', '+Ar']],
                             columns=['sf_id', 'ta', 'da'])

    msm_df = pd.DataFrame([[1, '+H', 0.85],
                           [2, '+H', 0.5],
                           [1, '+Cu', 0.5],
                           [1, '+Co', 0.5],
                           [2, '+Ag', 0.75],
                           [2, '+Ar', 0.0]],
                          columns=['sf_id', 'adduct', 'msm']).set_index(['sf_id', 'adduct']).sort_index()
    exp_sf_df = pd.DataFrame([[1, '+H', 0.2],
                              [2, '+H', 0.8]],
                             columns=['sf_id', 'adduct', 'fdr']).set_index(['sf_id', 'adduct'])

    assert_frame_equal(fdr.estimate_fdr(msm_df), exp_sf_df)

def test_estimate_fdr_returns_correct_df():
    fdr = FDR(job_id=0, decoy_sample_size=2, target_adducts=['+H'], db=None)
    fdr.fdr_levels = [0.2, 0.8]
    fdr.td_df = pd.DataFrame([['H2O', '+H', '+Cu'],
                              ['H2O', '+H', '+Co'],
                              ['C2H2', '+H', '+Ag'],
                              ['C2H2', '+H', '+Ar']],
                             columns=['sf', 'ta', 'da'])

    msm_df = pd.DataFrame([['H2O', '+H', 0.85],
                           ['C2H2', '+H', 0.5],
                           ['H2O', '+Cu', 0.5],
                           ['H2O', '+Co', 0.5],
                           ['C2H2', '+Ag', 0.75],
                           ['C2H2', '+Ar', 0.0]],
                          columns=['sf', 'adduct', 'msm']).set_index(['sf', 'adduct']).sort_index()
    exp_sf_df = pd.DataFrame([['H2O', '+H', 0.2],
                              ['C2H2', '+H', 0.8]],
                             columns=['sf', 'adduct', 'fdr']).set_index(['sf', 'adduct'])

    assert_frame_equal(fdr.estimate_fdr(msm_df), exp_sf_df)

def test_ions():
    sfs = ['H2O', 'C5H2OH']
    target_adducts = ['+H', '+Na']
    decoy_sample_size = 5

    fdr = FDR(job_id=0, decoy_sample_size=decoy_sample_size, target_adducts=target_adducts, db=None)
    fdr.decoy_adducts_selection(target_ions=[('H2O', '+H'), ('H2O', '+Na'),
                                             ('C5H2OH', '+H'), ('C5H2OH', '+Na')])
    ions = fdr.ion_tuples()

    assert type(ions) == list
    # total number varies because different (sf, adduct) pairs may receive the same (sf, decoy_adduct) pair
    assert len(sfs) * decoy_sample_size + len(sfs) * len(target_adducts) < \
        len(ions) <= \
        len(sfs) * len(target_adducts) * decoy_sample_size + len(sfs) * len(target_adducts)

    target_ions = [(sf, adduct) for sf, adduct in product(sfs, target_adducts)]
    assert set(target_ions).issubset(set(map(tuple, ions)))

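# For concreteness, the bounds asserted in test_ions above evaluate to the
# following with the values the test uses (2 formulas, 2 target adducts,
# decoy_sample_size=5); the numbers come purely from the test's own arithmetic,
# not from the engine:
#   lower bound (exclusive): len(sfs)*decoy_sample_size + len(sfs)*len(target_adducts) = 2*5 + 2*2 = 14
#   upper bound (inclusive): len(sfs)*len(target_adducts)*decoy_sample_size + len(sfs)*len(target_adducts) = 2*2*5 + 2*2 = 24
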
def _run_annotation_job(self, mol_db):
    try:
        self.store_job_meta(mol_db.id)
        mol_db.set_job_id(self._job_id)

        logger.info("Running new job ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                    self._ds.id, self._ds.name, mol_db.name, mol_db.version)

        target_adducts = self._ds.config['isotope_generation']['adducts']
        self._fdr = FDR(job_id=self._job_id,
                        decoy_sample_size=20,
                        target_adducts=target_adducts,
                        db=self._db)

        isocalc = IsocalcWrapper(self._ds.config['isotope_generation'])
        centroids_gen = IonCentroidsGenerator(sc=self._sc, moldb_name=mol_db.name, isocalc=isocalc)
        polarity = self._ds.config['isotope_generation']['charge']['polarity']
        all_adducts = list(set(self._sm_config['defaults']['adducts'][polarity]) | set(DECOY_ADDUCTS))
        centroids_gen.generate_if_not_exist(isocalc=isocalc,
                                            sfs=mol_db.sfs,
                                            adducts=all_adducts)
        target_ions = centroids_gen.ions(target_adducts)
        self._fdr.decoy_adducts_selection(target_ions)

        search_alg = MSMBasicSearch(sc=self._sc, ds=self._ds, ds_reader=self._ds_reader,
                                    mol_db=mol_db, centr_gen=centroids_gen,
                                    fdr=self._fdr, ds_config=self._ds.config)
        ion_metrics_df, ion_iso_images = search_alg.search()

        search_results = SearchResults(mol_db.id, self._job_id, search_alg.metrics.keys())
        mask = self._ds_reader.get_2d_sample_area_mask()
        img_store_type = self._ds.get_ion_img_storage_type(self._db)
        search_results.store(ion_metrics_df, ion_iso_images, mask,
                             self._db, self._img_store, img_store_type)
    except Exception as e:
        self._db.alter(JOB_UPD, params=('FAILED', datetime.now().strftime('%Y-%m-%d %H:%M:%S'), self._job_id))
        msg = 'Job failed(ds_id={}, mol_db={}): {}'.format(self._ds.id, mol_db, str(e))
        raise JobFailedError(msg) from e
    else:
        self._export_search_results_to_es(mol_db, isocalc)

def run(self, input_path, ds_config_path, clean=False):
    """ Entry point of the engine. Molecule search is completed in several steps:
        * Copying input data to the engine work dir
        * Conversion input data (imzML+ibd) to plain text format. One line - one spectrum data
        * Generation and saving to the database theoretical peaks for all formulas from the molecule database
        * Molecules search. The most compute intensive part. Spark is used to run it in distributed manner.
        * Saving results (isotope images and their metrics of quality for each putative molecule) to the database

    Args
    -------
    input_path : string
        Path to the dataset folder with .imzML and .ibd files
    ds_config_path: string
        Path to the dataset config file
    clean : bool
        Clean all interim data files before starting molecule search
    """
    try:
        self.wd_manager = WorkDirManager(self.ds_name)
        if clean:
            self.wd_manager.clean()

        self.wd_manager.copy_input_data(input_path, ds_config_path)

        self._read_ds_config()
        logger.info('Dataset config:\n%s', pformat(self.ds_config))

        self._configure_spark()
        self._init_db()

        if not self.wd_manager.exists(self.wd_manager.txt_path):
            imzml_converter = ImzmlTxtConverter(self.ds_name,
                                                self.wd_manager.local_dir.imzml_path,
                                                self.wd_manager.local_dir.txt_path,
                                                self.wd_manager.local_dir.coord_path)
            imzml_converter.convert()

            if not self.wd_manager.local_fs_only:
                self.wd_manager.upload_to_remote()

        self.ds = Dataset(self.sc, self.ds_name, self.client_email, input_path,
                          self.ds_config, self.wd_manager, self.db)
        self.ds.save_ds_meta()

        self.store_job_meta()

        theor_peaks_gen = TheorPeaksGenerator(self.sc, self.sm_config, self.ds_config)
        theor_peaks_gen.run()

        target_adducts = self.ds_config['isotope_generation']['adducts']
        self.fdr = FDR(self.job_id, self.sf_db_id,
                       decoy_sample_size=20, target_adducts=target_adducts, db=self.db)
        self.fdr.decoy_adduct_selection()

        self.formulas = FormulasSegm(self.job_id, self.sf_db_id, self.ds_config, self.db)
        # search_alg = MSMBasicSearch(self.sc, self.ds, self.formulas, self.fdr, self.ds_config)
        search_alg = MSMExtraFeats(self.sc, self.ds, self.formulas, self.fdr, self.ds_config)
        sf_metrics_df, sf_iso_images = search_alg.search()

        search_results = SearchResults(self.sf_db_id, self.ds_id, self.job_id, self.ds_name,
                                       self.formulas.get_sf_adduct_peaksn(),
                                       self.db, self.sm_config, self.ds_config)
        search_results.sf_metrics_df = sf_metrics_df
        search_results.sf_iso_images = sf_iso_images
        search_results.metrics = search_alg.metrics
        search_results.nrows, search_results.ncols = self.ds.get_dims()
        search_results.store()

        es = ESExporter(self.sm_config)
        es.index_ds(self.db, self.ds_name, self.ds_config['database']['name'])
    except Exception:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        logger.error('\n'.join(traceback.format_exception(exc_type, exc_value, exc_traceback)))
    finally:
        if self.sc:
            # self.sc.show_profiles()
            self.sc.stop()
        if self.db:
            self.db.close()

class SearchJob(object):
    """ Main class responsible for molecule search. Uses other modules of the engine.

    Args
    ----
    no_clean : bool
        Don't delete interim data files
    """
    def __init__(self, img_store=None, no_clean=False):
        self.no_clean = no_clean
        self._img_store = img_store

        self._job_id = None
        self._sc = None
        self._db = None
        self._ds = None
        self._ds_reader = None
        self._status_queue = None
        self._fdr = None
        self._wd_manager = None
        self._es = None

        self._sm_config = SMConfig.get_conf()

        logger.debug('Using SM config:\n%s', pformat(self._sm_config))

    def _configure_spark(self):
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self._sm_config['spark'].items():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        if 'aws' in self._sm_config:
            sconf.set("spark.hadoop.fs.s3a.access.key", self._sm_config['aws']['aws_access_key_id'])
            sconf.set("spark.hadoop.fs.s3a.secret.key", self._sm_config['aws']['aws_secret_access_key'])
            sconf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            sconf.set("spark.hadoop.fs.s3a.endpoint",
                      "s3.{}.amazonaws.com".format(self._sm_config['aws']['aws_region']))

        self._sc = SparkContext(master=self._sm_config['spark']['master'], conf=sconf, appName='SM engine')

    def _init_db(self):
        logger.info('Connecting to the DB')
        self._db = DB(self._sm_config['db'])

    def store_job_meta(self, mol_db_id):
        """ Store search job metadata in the database """
        logger.info('Storing job metadata')
        rows = [(mol_db_id, self._ds.id, 'STARTED', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))]
        self._job_id = self._db.insert_return(JOB_INS, rows=rows)[0]

    def _run_annotation_job(self, mol_db):
        try:
            self.store_job_meta(mol_db.id)
            mol_db.set_job_id(self._job_id)

            logger.info("Running new job ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                        self._ds.id, self._ds.name, mol_db.name, mol_db.version)

            target_adducts = self._ds.config['isotope_generation']['adducts']
            self._fdr = FDR(job_id=self._job_id,
                            decoy_sample_size=20,
                            target_adducts=target_adducts,
                            db=self._db)

            isocalc = IsocalcWrapper(self._ds.config['isotope_generation'])
            centroids_gen = IonCentroidsGenerator(sc=self._sc, moldb_name=mol_db.name, isocalc=isocalc)
            polarity = self._ds.config['isotope_generation']['charge']['polarity']
            all_adducts = list(set(self._sm_config['defaults']['adducts'][polarity]) | set(DECOY_ADDUCTS))
            centroids_gen.generate_if_not_exist(isocalc=isocalc,
                                                sfs=mol_db.sfs,
                                                adducts=all_adducts)
            target_ions = centroids_gen.ions(target_adducts)
            self._fdr.decoy_adducts_selection(target_ions)

            search_alg = MSMBasicSearch(sc=self._sc, ds=self._ds, ds_reader=self._ds_reader,
                                        mol_db=mol_db, centr_gen=centroids_gen,
                                        fdr=self._fdr, ds_config=self._ds.config)
            ion_metrics_df, ion_iso_images = search_alg.search()

            search_results = SearchResults(mol_db.id, self._job_id, search_alg.metrics.keys())
            mask = self._ds_reader.get_2d_sample_area_mask()
            img_store_type = self._ds.get_ion_img_storage_type(self._db)
            search_results.store(ion_metrics_df, ion_iso_images, mask,
                                 self._db, self._img_store, img_store_type)
        except Exception as e:
            self._db.alter(JOB_UPD, params=('FAILED', datetime.now().strftime('%Y-%m-%d %H:%M:%S'), self._job_id))
            msg = 'Job failed(ds_id={}, mol_db={}): {}'.format(self._ds.id, mol_db, str(e))
            raise JobFailedError(msg) from e
        else:
            self._export_search_results_to_es(mol_db, isocalc)

    def _export_search_results_to_es(self, mol_db, isocalc):
        try:
            self._es.index_ds(self._ds.id, mol_db, isocalc)
        except Exception as e:
            self._db.alter(JOB_UPD, params=('FAILED', datetime.now().strftime('%Y-%m-%d %H:%M:%S'), self._job_id))
            msg = 'Export to ES failed(ds_id={}, mol_db={}): {}'.format(self._ds.id, mol_db, str(e))
            raise ESExportFailedError(msg) from e
        else:
            self._db.alter(JOB_UPD, params=('FINISHED', datetime.now().strftime('%Y-%m-%d %H:%M:%S'), self._job_id))

    def _remove_annotation_job(self, mol_db):
        logger.info("Removing job results ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                    self._ds.id, self._ds.name, mol_db.name, mol_db.version)
        self._db.alter('DELETE FROM job WHERE ds_id = %s and db_id = %s',
                       params=(self._ds.id, mol_db.id))
        self._es.delete_ds(self._ds.id, mol_db)

    def _moldb_ids(self):
        moldb_service = MolDBServiceWrapper(self._sm_config['services']['mol_db'])
        completed_moldb_ids = {moldb_service.find_db_by_id(db_id)['id']
                               for (_, db_id) in self._db.select(JOB_ID_MOLDB_ID_SEL, params=(self._ds.id,))}
        new_moldb_ids = {moldb_service.find_db_by_name_version(moldb_name)[0]['id']
                         for moldb_name in self._ds.config['databases']}
        return completed_moldb_ids, new_moldb_ids

    def _save_data_from_raw_ms_file(self):
        ms_file_type_config = SMConfig.get_ms_file_handler(self._wd_manager.local_dir.ms_file_path)
        acq_geometry_factory_module = ms_file_type_config['acq_geometry_factory']
        acq_geometry_factory = getattr(import_module(acq_geometry_factory_module['path']),
                                       acq_geometry_factory_module['name'])
        acq_geometry = acq_geometry_factory(self._wd_manager.local_dir.ms_file_path).create()
        self._ds.save_acq_geometry(self._db, acq_geometry)
        self._ds.save_ion_img_storage_type(self._db, ms_file_type_config['img_storage_type'])

    def run(self, ds):
        """ Entry point of the engine. Molecule search is completed in several steps:
            * Copying input data to the engine work dir
            * Conversion input mass spec files to plain text format. One line - one spectrum data
            * Generation and saving to the database theoretical peaks for all formulas from the molecule database
            * Molecules search. The most compute intensive part. Spark is used to run it in distributed manner.
            * Saving results (isotope images and their metrics of quality for each putative molecule) to the database

        Args
        ----
        ds : sm.engine.dataset_manager.Dataset
        """
        try:
            start = time.time()

            self._init_db()
            self._es = ESExporter(self._db)
            self._ds = ds

            if self._sm_config['rabbitmq']:
                self._status_queue = QueuePublisher(config=self._sm_config['rabbitmq'],
                                                    qdesc=SM_DS_STATUS,
                                                    logger=logger)
            else:
                self._status_queue = None
            ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.STARTED)

            self._wd_manager = WorkDirManager(ds.id)
            self._configure_spark()

            if not self.no_clean:
                self._wd_manager.clean()

            self._ds_reader = DatasetReader(self._ds.input_path, self._sc, self._wd_manager)
            self._ds_reader.copy_convert_input_data()

            self._save_data_from_raw_ms_file()
            self._img_store.storage_type = self._ds.get_ion_img_storage_type(self._db)

            ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.STARTED)

            logger.info('Dataset config:\n%s', pformat(self._ds.config))

            completed_moldb_ids, new_moldb_ids = self._moldb_ids()
            for moldb_id in completed_moldb_ids.symmetric_difference(new_moldb_ids):  # ignore ids present in both sets
                mol_db = MolecularDB(id=moldb_id, db=self._db,
                                     iso_gen_config=self._ds.config['isotope_generation'])
                if moldb_id not in new_moldb_ids:
                    self._remove_annotation_job(mol_db)
                elif moldb_id not in completed_moldb_ids:
                    self._run_annotation_job(mol_db)

            ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.FINISHED)

            logger.info("All done!")
            time_spent = time.time() - start
            logger.info('Time spent: %d mins %d secs', *divmod(int(round(time_spent)), 60))
        except Exception as e:
            if self._ds:
                ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.FAILED)
            logger.error(e, exc_info=True)
            raise
        finally:
            if self._sc:
                self._sc.stop()
            if self._db:
                self._db.close()
            if self._wd_manager and not self.no_clean:
                self._wd_manager.clean()
            logger.info('*' * 150)

class SearchJob(object):
    """ Main class responsible for molecule search. Uses other modules of the engine.

    Args
    ----
    no_clean : bool
        Don't delete interim data files
    """
    def __init__(self, img_store=None, no_clean=False):
        self.no_clean = no_clean
        self._img_store = img_store

        self._job_id = None
        self._sc = None
        self._db = None
        self._ds = None
        self._ds_reader = None
        self._status_queue = None
        self._fdr = None
        self._wd_manager = None
        self._es = None

        self._sm_config = SMConfig.get_conf()

        logger.debug('Using SM config:\n%s', pformat(self._sm_config))

    def _configure_spark(self):
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self._sm_config['spark'].items():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        if 'aws' in self._sm_config:
            sconf.set("spark.hadoop.fs.s3a.access.key", self._sm_config['aws']['aws_access_key_id'])
            sconf.set("spark.hadoop.fs.s3a.secret.key", self._sm_config['aws']['aws_secret_access_key'])
            sconf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            sconf.set("spark.hadoop.fs.s3a.endpoint",
                      "s3.{}.amazonaws.com".format(self._sm_config['aws']['aws_region']))

        self._sc = SparkContext(master=self._sm_config['spark']['master'], conf=sconf, appName='SM engine')

    def _init_db(self):
        logger.info('Connecting to the DB')
        self._db = DB(self._sm_config['db'])

    def store_job_meta(self, mol_db_id):
        """ Store search job metadata in the database """
        logger.info('Storing job metadata')
        rows = [(mol_db_id, self._ds.id, JobStatus.RUNNING,
                 datetime.now().strftime('%Y-%m-%d %H:%M:%S'))]
        self._job_id = self._db.insert_return(JOB_INS, rows=rows)[0]

    def _run_annotation_job(self, mol_db):
        try:
            self.store_job_meta(mol_db.id)
            mol_db.set_job_id(self._job_id)

            logger.info("Running new job ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                        self._ds.id, self._ds.name, mol_db.name, mol_db.version)

            target_adducts = self._ds.config['isotope_generation']['adducts']
            self._fdr = FDR(job_id=self._job_id,
                            decoy_sample_size=20,
                            target_adducts=target_adducts,
                            db=self._db)

            isocalc = IsocalcWrapper(self._ds.config['isotope_generation'])
            centroids_gen = IonCentroidsGenerator(sc=self._sc, moldb_name=mol_db.name, isocalc=isocalc)
            polarity = self._ds.config['isotope_generation']['charge']['polarity']
            all_adducts = list(set(self._sm_config['defaults']['adducts'][polarity]) | set(DECOY_ADDUCTS))
            centroids_gen.generate_if_not_exist(isocalc=isocalc,
                                                sfs=mol_db.sfs,
                                                adducts=all_adducts)
            target_ions = centroids_gen.ions(target_adducts)
            self._fdr.decoy_adducts_selection(target_ions)

            search_alg = MSMBasicSearch(sc=self._sc, ds=self._ds, ds_reader=self._ds_reader,
                                        mol_db=mol_db, centr_gen=centroids_gen,
                                        fdr=self._fdr, ds_config=self._ds.config)
            ion_metrics_df, ion_iso_images = search_alg.search()

            search_results = SearchResults(mol_db.id, self._job_id, search_alg.metrics.keys())
            mask = self._ds_reader.get_2d_sample_area_mask()
            img_store_type = self._ds.get_ion_img_storage_type(self._db)
            search_results.store(ion_metrics_df, ion_iso_images, mask,
                                 self._db, self._img_store, img_store_type)
        except Exception as e:
            self._db.alter(JOB_UPD_STATUS_FINISH,
                           params=(JobStatus.FAILED,
                                   datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                   self._job_id))
            msg = 'Job failed(ds_id={}, mol_db={}): {}'.format(self._ds.id, mol_db, str(e))
            raise JobFailedError(msg) from e
        else:
            self._db.alter(JOB_UPD_STATUS_FINISH,
                           params=(JobStatus.FINISHED,
                                   datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                   self._job_id))

    def _remove_annotation_job(self, mol_db):
        logger.info("Removing job results ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                    self._ds.id, self._ds.name, mol_db.name, mol_db.version)
        self._db.alter('DELETE FROM job WHERE ds_id = %s and db_id = %s',
                       params=(self._ds.id, mol_db.id))
        self._es.delete_ds(self._ds.id, mol_db)

    def _moldb_ids(self):
        completed_moldb_ids = {db_id
                               for (_, db_id) in self._db.select(JOB_ID_MOLDB_ID_SEL, params=(self._ds.id,))}
        new_moldb_ids = {MolecularDB(name=moldb_name).id
                         for moldb_name in self._ds.config['databases']}
        return completed_moldb_ids, new_moldb_ids

    def _save_data_from_raw_ms_file(self):
        ms_file_type_config = SMConfig.get_ms_file_handler(self._wd_manager.local_dir.ms_file_path)
        acq_geometry_factory_module = ms_file_type_config['acq_geometry_factory']
        acq_geometry_factory = getattr(import_module(acq_geometry_factory_module['path']),
                                       acq_geometry_factory_module['name'])
        acq_geometry = acq_geometry_factory(self._wd_manager.local_dir.ms_file_path).create()
        self._ds.save_acq_geometry(self._db, acq_geometry)
        self._ds.save_ion_img_storage_type(self._db, ms_file_type_config['img_storage_type'])

    def run(self, ds):
        """ Entry point of the engine. Molecule search is completed in several steps:
            * Copying input data to the engine work dir
            * Conversion input mass spec files to plain text format. One line - one spectrum data
            * Generation and saving to the database theoretical peaks for all formulas from the molecule database
            * Molecules search. The most compute intensive part. Spark is used to run it in distributed manner.
            * Saving results (isotope images and their metrics of quality for each putative molecule) to the database

        Args
        ----
        ds : sm.engine.dataset_manager.Dataset
        """
        try:
            logger.info('*' * 150)
            start = time.time()

            self._init_db()
            self._es = ESExporter(self._db)
            self._ds = ds

            if self._sm_config['rabbitmq']:
                self._status_queue = QueuePublisher(config=self._sm_config['rabbitmq'],
                                                    qdesc=SM_DS_STATUS,
                                                    logger=logger)
            else:
                self._status_queue = None

            self._wd_manager = WorkDirManager(ds.id)
            self._configure_spark()

            if not self.no_clean:
                self._wd_manager.clean()

            self._ds_reader = DatasetReader(self._ds.input_path, self._sc, self._wd_manager)
            self._ds_reader.copy_convert_input_data()

            self._save_data_from_raw_ms_file()
            self._img_store.storage_type = self._ds.get_ion_img_storage_type(self._db)

            logger.info('Dataset config:\n%s', pformat(self._ds.config))

            completed_moldb_ids, new_moldb_ids = self._moldb_ids()
            for moldb_id in completed_moldb_ids.symmetric_difference(new_moldb_ids):  # ignore ids present in both sets
                mol_db = MolecularDB(id=moldb_id, db=self._db,
                                     iso_gen_config=self._ds.config['isotope_generation'])
                if moldb_id not in new_moldb_ids:
                    self._remove_annotation_job(mol_db)
                elif moldb_id not in completed_moldb_ids:
                    self._run_annotation_job(mol_db)

            logger.info("All done!")
            time_spent = time.time() - start
            logger.info('Time spent: %d mins %d secs', *divmod(int(round(time_spent)), 60))
        finally:
            if self._sc:
                self._sc.stop()
            if self._db:
                self._db.close()
            if self._wd_manager and not self.no_clean:
                self._wd_manager.clean()
            logger.info('*' * 150)

class SearchJob(object):
    """ Main class responsible for molecule search. Uses other modules of the engine.

    Args
    ----------
    ds_name : string
        A dataset short name
    """
    def __init__(self, client_email, ds_name):
        self.sm_config = SMConfig.get_conf()
        self.client_email = client_email
        self.ds_name = ds_name
        self.ds_id = None
        self.job_id = None
        self.sc = None
        self.db = None
        self.ds = None
        self.fdr = None
        self.formulas = None
        self.ds_config = None
        self.wd_manager = None

    def _read_ds_config(self):
        with open(self.wd_manager.ds_config_path) as f:
            self.ds_config = json.load(f)

    def _configure_spark(self):
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self.sm_config['spark'].iteritems():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        if 'aws' in self.sm_config:
            sconf.set("spark.hadoop.fs.s3a.access.key", self.sm_config['aws']['aws_access_key_id'])
            sconf.set("spark.hadoop.fs.s3a.secret.key", self.sm_config['aws']['aws_secret_access_key'])
            sconf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

        # sconf.set("spark.python.profile", "true")
        self.sc = SparkContext(master=self.sm_config['spark']['master'], conf=sconf, appName='SM engine')
        if not self.sm_config['spark']['master'].startswith('local'):
            self.sc.addPyFile(join(local_path(proj_root()), 'sm.zip'))

    def _init_db(self):
        logger.info('Connecting to the DB')
        self.db = DB(self.sm_config['db'])
        self.sf_db_id = self.db.select_one(DB_ID_SEL, self.ds_config['database']['name'])[0]

    def store_job_meta(self):
        """ Store search job metadata in the database """
        logger.info('Storing job metadata')
        self.ds_id = int(self.db.select_one(DS_ID_SEL, self.ds_name)[0])
        self.job_id = self.ds_id
        self.db.alter(DEL_JOB_SQL, self.job_id)
        rows = [(self.job_id, self.sf_db_id, self.ds_id,
                 datetime.now().strftime('%Y-%m-%d %H:%M:%S'))]
        self.db.insert(JOB_INS, rows)

        rows = [(self.job_id, adduct) for adduct in self.ds_config['isotope_generation']['adducts']]
        self.db.insert(ADDUCT_INS, rows)

    def run(self, input_path, ds_config_path, clean=False):
        """ Entry point of the engine. Molecule search is completed in several steps:
            * Copying input data to the engine work dir
            * Conversion input data (imzML+ibd) to plain text format. One line - one spectrum data
            * Generation and saving to the database theoretical peaks for all formulas from the molecule database
            * Molecules search. The most compute intensive part. Spark is used to run it in distributed manner.
            * Saving results (isotope images and their metrics of quality for each putative molecule) to the database

        Args
        -------
        input_path : string
            Path to the dataset folder with .imzML and .ibd files
        ds_config_path: string
            Path to the dataset config file
        clean : bool
            Clean all interim data files before starting molecule search
        """
        try:
            self.wd_manager = WorkDirManager(self.ds_name)
            if clean:
                self.wd_manager.clean()

            self.wd_manager.copy_input_data(input_path, ds_config_path)

            self._read_ds_config()
            logger.info('Dataset config:\n%s', pformat(self.ds_config))

            self._configure_spark()
            self._init_db()

            if not self.wd_manager.exists(self.wd_manager.txt_path):
                imzml_converter = ImzmlTxtConverter(self.ds_name,
                                                    self.wd_manager.local_dir.imzml_path,
                                                    self.wd_manager.local_dir.txt_path,
                                                    self.wd_manager.local_dir.coord_path)
                imzml_converter.convert()

                if not self.wd_manager.local_fs_only:
                    self.wd_manager.upload_to_remote()

            self.ds = Dataset(self.sc, self.ds_name, self.client_email, input_path,
                              self.ds_config, self.wd_manager, self.db)
            self.ds.save_ds_meta()

            self.store_job_meta()

            theor_peaks_gen = TheorPeaksGenerator(self.sc, self.sm_config, self.ds_config)
            theor_peaks_gen.run()

            target_adducts = self.ds_config['isotope_generation']['adducts']
            self.fdr = FDR(self.job_id, self.sf_db_id,
                           decoy_sample_size=20, target_adducts=target_adducts, db=self.db)
            self.fdr.decoy_adduct_selection()

            self.formulas = FormulasSegm(self.job_id, self.sf_db_id, self.ds_config, self.db)
            # search_alg = MSMBasicSearch(self.sc, self.ds, self.formulas, self.fdr, self.ds_config)
            search_alg = MSMExtraFeats(self.sc, self.ds, self.formulas, self.fdr, self.ds_config)
            sf_metrics_df, sf_iso_images = search_alg.search()

            search_results = SearchResults(self.sf_db_id, self.ds_id, self.job_id, self.ds_name,
                                           self.formulas.get_sf_adduct_peaksn(),
                                           self.db, self.sm_config, self.ds_config)
            search_results.sf_metrics_df = sf_metrics_df
            search_results.sf_iso_images = sf_iso_images
            search_results.metrics = search_alg.metrics
            search_results.nrows, search_results.ncols = self.ds.get_dims()
            search_results.store()

            es = ESExporter(self.sm_config)
            es.index_ds(self.db, self.ds_name, self.ds_config['database']['name'])
        except Exception:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            logger.error('\n'.join(traceback.format_exception(exc_type, exc_value, exc_traceback)))
        finally:
            if self.sc:
                # self.sc.show_profiles()
                self.sc.stop()
            if self.db:
                self.db.close()