Example #1
# Imports assumed from the sm-engine package layout of this (Python 2-era) code:
from mock import MagicMock, patch
from sm.engine.db import DB
from sm.engine.dataset import Dataset
from sm.engine.util import SMConfig
from sm.engine.work_dir import WorkDirManager


def test_save_ds_meta_ds_doesnt_exist(spark_context, create_test_db, drop_test_db, sm_config, ds_config):
    work_dir_man_mock = MagicMock(WorkDirManager)
    work_dir_man_mock.ds_coord_path = '/ds_path'
    work_dir_man_mock.txt_path = '/txt_path'

    SMConfig._config_dict = sm_config

    with patch('sm.engine.tests.util.SparkContext.textFile') as m:
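        # Stub SparkContext.textFile with two coordinate lines in "index,x,y" format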
        m.return_value = spark_context.parallelize([
            '0,1,1\n',
            '1,100,200\n'])

        dataset = Dataset(spark_context, 'ds_name', '', ds_config, work_dir_man_mock, DB(sm_config['db']))
        dataset.save_ds_meta()

    db = DB(sm_config['db'])
    ds_row = db.select_one('SELECT name, file_path, img_bounds, config from dataset')
    assert ds_row == ('ds_name', '/txt_path',
                      {u'x': {u'min': 1, u'max': 100}, u'y': {u'min': 1, u'max': 200}},
                      ds_config)

    coord_row = db.select_one('SELECT xs, ys from coordinates')
    assert coord_row == ([1, 100], [1, 200])

    db.close()
Example #2
# The same assumed imports as in Example #1 apply here.
def test_save_ds_meta_ds_doesnt_exist(spark_context, create_test_db,
                                      drop_test_db, sm_config, ds_config):
    work_dir_man_mock = MagicMock(WorkDirManager)
    work_dir_man_mock.ds_coord_path = '/ds_path'
    work_dir_man_mock.txt_path = '/txt_path'

    SMConfig._config_dict = sm_config

    with patch('sm.engine.tests.util.SparkContext.textFile') as m:
        m.return_value = spark_context.parallelize(['0,1,1\n', '1,100,200\n'])

        dataset = Dataset(spark_context, 'ds_name', '', 'input_path',
                          ds_config, work_dir_man_mock, DB(sm_config['db']))
        dataset.save_ds_meta()

    db = DB(sm_config['db'])
    ds_row = db.select_one(
        'SELECT name, file_path, img_bounds, config from dataset')
    assert ds_row == ('ds_name', 'input_path',
                      {u'x': {u'min': 1, u'max': 100}, u'y': {u'min': 1, u'max': 200}},
                      ds_config)

    coord_row = db.select_one('SELECT xs, ys from coordinates')
    assert coord_row == ([1, 100], [1, 200])

    db.close()
Example #3
class SearchJob(object):
    """ Main class responsible for molecule search. Uses other modules of the engine.

    Args
    ----------
    client_email : string
        Email of the client who submitted the dataset
    ds_name : string
        A short dataset name
    """
    def __init__(self, client_email, ds_name):
        self.sm_config = SMConfig.get_conf()
        self.client_email = client_email
        self.ds_name = ds_name
        self.ds_id = None
        self.job_id = None
        self.sc = None
        self.db = None
        self.ds = None
        self.fdr = None
        self.formulas = None
        self.ds_config = None
        self.wd_manager = None

    def _read_ds_config(self):
        with open(self.wd_manager.ds_config_path) as f:
            self.ds_config = json.load(f)

    def _configure_spark(self):
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self.sm_config['spark'].iteritems():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        if 'aws' in self.sm_config:
            sconf.set("spark.hadoop.fs.s3a.access.key",
                      self.sm_config['aws']['aws_access_key_id'])
            sconf.set("spark.hadoop.fs.s3a.secret.key",
                      self.sm_config['aws']['aws_secret_access_key'])
            sconf.set("spark.hadoop.fs.s3a.impl",
                      "org.apache.hadoop.fs.s3a.S3AFileSystem")

        # sconf.set("spark.python.profile", "true")
        self.sc = SparkContext(master=self.sm_config['spark']['master'],
                               conf=sconf,
                               appName='SM engine')
        if not self.sm_config['spark']['master'].startswith('local'):
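            # Ship the packaged engine code to the Spark workers when not running locally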
            self.sc.addPyFile(join(local_path(proj_root()), 'sm.zip'))

    def _init_db(self):
        logger.info('Connecting to the DB')
        self.db = DB(self.sm_config['db'])
        self.sf_db_id = self.db.select_one(
            DB_ID_SEL, self.ds_config['database']['name'])[0]

    def store_job_meta(self):
        """ Store search job metadata in the database """
        logger.info('Storing job metadata')
        self.ds_id = int(self.db.select_one(DS_ID_SEL, self.ds_name)[0])
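        # The job id mirrors the dataset id; any previous job for it is deleted first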
        self.job_id = self.ds_id
        self.db.alter(DEL_JOB_SQL, self.job_id)
        rows = [(self.job_id, self.sf_db_id, self.ds_id,
                 datetime.now().strftime('%Y-%m-%d %H:%M:%S'))]
        self.db.insert(JOB_INS, rows)

        rows = [(self.job_id, adduct)
                for adduct in self.ds_config['isotope_generation']['adducts']]
        self.db.insert(ADDUCT_INS, rows)

    def run(self, input_path, ds_config_path, clean=False):
        """ Entry point of the engine. Molecule search is completed in several steps:
         * Copying the input data to the engine work dir
         * Converting the input data (imzML+ibd) to plain text format, one line per spectrum
         * Generating theoretical peaks for all formulas from the molecule database and saving them
         * Searching for molecules. This is the most compute-intensive part; Spark runs it in a distributed manner.
         * Saving the results (isotope images and their quality metrics for each putative molecule) to the database

        Args
        -------
        input_path : string
            Path to the dataset folder with .imzML and .ibd files
        ds_config_path: string
            Path to the dataset config file
        clean : bool
            Clean all interim data files before starting molecule search
        """
        try:
            self.wd_manager = WorkDirManager(self.ds_name)
            if clean:
                self.wd_manager.clean()

            self.wd_manager.copy_input_data(input_path, ds_config_path)

            self._read_ds_config()
            logger.info('Dataset config:\n%s', pformat(self.ds_config))

            self._configure_spark()
            self._init_db()

            if not self.wd_manager.exists(self.wd_manager.txt_path):
                imzml_converter = ImzmlTxtConverter(
                    self.ds_name, self.wd_manager.local_dir.imzml_path,
                    self.wd_manager.local_dir.txt_path,
                    self.wd_manager.local_dir.coord_path)
                imzml_converter.convert()

                if not self.wd_manager.local_fs_only:
                    self.wd_manager.upload_to_remote()

            self.ds = Dataset(self.sc, self.ds_name, self.client_email,
                              input_path, self.ds_config, self.wd_manager,
                              self.db)
            self.ds.save_ds_meta()

            self.store_job_meta()

            theor_peaks_gen = TheorPeaksGenerator(self.sc, self.sm_config,
                                                  self.ds_config)
            theor_peaks_gen.run()

            target_adducts = self.ds_config['isotope_generation']['adducts']
            self.fdr = FDR(self.job_id,
                           self.sf_db_id,
                           decoy_sample_size=20,
                           target_adducts=target_adducts,
                           db=self.db)
            self.fdr.decoy_adduct_selection()
            self.formulas = FormulasSegm(self.job_id, self.sf_db_id,
                                         self.ds_config, self.db)

            # search_alg = MSMBasicSearch(self.sc, self.ds, self.formulas, self.fdr, self.ds_config)
            search_alg = MSMExtraFeats(self.sc, self.ds, self.formulas,
                                       self.fdr, self.ds_config)
            sf_metrics_df, sf_iso_images = search_alg.search()

            search_results = SearchResults(
                self.sf_db_id, self.ds_id, self.job_id, self.ds_name,
                self.formulas.get_sf_adduct_peaksn(), self.db, self.sm_config,
                self.ds_config)
            search_results.sf_metrics_df = sf_metrics_df
            search_results.sf_iso_images = sf_iso_images
            search_results.metrics = search_alg.metrics
            search_results.nrows, search_results.ncols = self.ds.get_dims()
            search_results.store()

            es = ESExporter(self.sm_config)
            es.index_ds(self.db, self.ds_name,
                        self.ds_config['database']['name'])

        except Exception:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            logger.error('\n'.join(
                traceback.format_exception(exc_type, exc_value,
                                           exc_traceback)))
        finally:
            if self.sc:
                # self.sc.show_profiles()
                self.sc.stop()
            if self.db:
                self.db.close()
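
A minimal usage sketch, assuming the SearchJob constructor and run() signatures shown above. The module path, config path, email address, and dataset paths are placeholders, and the config is loaded by assigning SMConfig._config_dict directly, as the tests in Examples #1 and #2 do:

import json

from sm.engine.search_job import SearchJob  # assumed module path
from sm.engine.util import SMConfig

# Placeholder config path; the tests above set SMConfig._config_dict the same way
with open('conf/config.json') as f:
    SMConfig._config_dict = json.load(f)

job = SearchJob('user@example.com', 'sample_ds')       # placeholder email and dataset name
job.run(input_path='/data/sample_ds',                  # folder with the .imzML and .ibd files
        ds_config_path='/data/sample_ds/config.json',  # dataset config file
        clean=True)                                    # wipe interim work-dir files first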