Esempio n. 1
0
    def _run_annotation_job(self, mol_db):
        """Run one annotation job of the dataset against ``mol_db``.

        The job metadata is stored first, then decoy adducts are selected,
        ion centroids are generated if missing, the MSM search is executed and
        its metrics and ion images are stored. The job row is updated with
        ``JobStatus.FINISHED`` on success and ``JobStatus.FAILED`` otherwise.

        Args:
            mol_db: molecular database object; this method reads its ``id``,
                ``name``, ``version`` and ``sfs`` attributes and calls
                ``set_job_id`` on it.

        Raises:
            JobFailedError: chained to whatever exception interrupted the job.
        """
        def mark_finished(status):
            # Stamp the job row with its final status and the current local time.
            finished_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            self._db.alter(JOB_UPD_STATUS_FINISH,
                           params=(status, finished_at, self._job_id))

        try:
            self.store_job_meta(mol_db.id)
            mol_db.set_job_id(self._job_id)

            logger.info(
                "Running new job ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                self._ds.id, self._ds.name, mol_db.name, mol_db.version)

            # FDR is configured with the adducts requested in the dataset config.
            target_adducts = self._ds.config['isotope_generation']['adducts']
            self._fdr = FDR(job_id=self._job_id,
                            decoy_sample_size=20,
                            target_adducts=target_adducts,
                            db=self._db)

            isocalc = IsocalcWrapper(self._ds.config['isotope_generation'])
            centroids = IonCentroidsGenerator(sc=self._sc,
                                              moldb_name=mol_db.name,
                                              isocalc=isocalc)

            # Centroids are generated for the default adducts of the dataset
            # polarity together with every decoy adduct.
            polarity = self._ds.config['isotope_generation']['charge']['polarity']
            default_adducts = set(self._sm_config['defaults']['adducts'][polarity])
            adducts_to_generate = list(default_adducts | set(DECOY_ADDUCTS))
            centroids.generate_if_not_exist(isocalc=isocalc,
                                            sfs=mol_db.sfs,
                                            adducts=adducts_to_generate)

            self._fdr.decoy_adducts_selection(centroids.ions(target_adducts))

            searcher = MSMBasicSearch(sc=self._sc,
                                      ds=self._ds,
                                      ds_reader=self._ds_reader,
                                      mol_db=mol_db,
                                      centr_gen=centroids,
                                      fdr=self._fdr,
                                      ds_config=self._ds.config)
            metrics_df, iso_images = searcher.search()

            results = SearchResults(mol_db.id, self._job_id,
                                    searcher.metrics.keys())
            sample_mask = self._ds_reader.get_2d_sample_area_mask()
            storage_type = self._ds.get_ion_img_storage_type(self._db)
            results.store(metrics_df, iso_images, sample_mask,
                          self._db, self._img_store, storage_type)
        except Exception as exc:
            mark_finished(JobStatus.FAILED)
            failure_msg = 'Job failed(ds_id={}, mol_db={}): {}'.format(
                self._ds.id, mol_db, str(exc))
            raise JobFailedError(failure_msg) from exc
        else:
            mark_finished(JobStatus.FINISHED)
Esempio n. 2
0
def test_estimate_fdr_digitize_works():
    """``estimate_fdr`` should report one of the configured FDR levels per target ion.

    Four formulas each get one target ('+H') and one decoy adduct; with
    ``fdr_levels`` set to [0.4, 0.8] the expected FDR values are 0.4 for the
    first three formulas and 0.8 for the last one.
    """
    fdr = FDR(job_id=0, decoy_sample_size=1, target_adducts=['+H'], db=None)
    fdr.fdr_levels = [0.4, 0.8]

    index_cols = ['sf', 'adduct']
    formulas = ['C1', 'C2', 'C3', 'C4']
    decoy_adducts = ['+Cu', '+Ag', '+Cl', '+Co']

    # One (formula, target adduct, decoy adduct) row per formula.
    fdr.td_df = pd.DataFrame(
        [[sf, '+H', da] for sf, da in zip(formulas, decoy_adducts)],
        columns=['sf', 'ta', 'da'])

    # MSM scores: target ions first, then the matching decoy ions.
    target_msm = [1.0, 0.75, 0.5, 0.25]
    decoy_msm = [0.75, 0.3, 0.25, 0.1]
    msm_rows = [[sf, '+H', msm] for sf, msm in zip(formulas, target_msm)]
    msm_rows += [[sf, da, msm]
                 for sf, da, msm in zip(formulas, decoy_adducts, decoy_msm)]
    msm_df = pd.DataFrame(msm_rows, columns=index_cols + ['msm'])
    msm_df = msm_df.set_index(index_cols).sort_index()

    expected_fdr = [0.4, 0.4, 0.4, 0.8]
    exp_sf_df = pd.DataFrame(
        [[sf, '+H', level] for sf, level in zip(formulas, expected_fdr)],
        columns=index_cols + ['fdr']).set_index(index_cols)

    assert_frame_equal(fdr.estimate_fdr(msm_df), exp_sf_df)
Esempio n. 3
0
def test_fdr_decoy_adduct_selection_saves_corr():
    """``decoy_adducts_selection`` should fill ``td_df`` with the expected target/decoy pairs.

    Both frames are sorted and re-indexed before comparison, so row order
    does not matter.
    """
    db_mock = MagicMock(DB)
    db_mock.select.return_value = [(1,)]

    target_adducts = ['+H', '+K']
    fdr = FDR(job_id=0, decoy_sample_size=2, target_adducts=target_adducts, db=db_mock)

    td_columns = ['sf', 'ta', 'da']
    # Every target adduct of H2O paired with each of the two decoy adducts.
    expected_rows = [('H2O', ta, da)
                     for ta in target_adducts
                     for da in ('+He', '+Li')]
    exp_target_decoy_df = pd.DataFrame(expected_rows, columns=td_columns)

    fdr.decoy_adducts_selection(
        target_ions=[('H2O', ta) for ta in target_adducts])

    def normalised(df):
        # Canonical row order and a fresh RangeIndex for order-insensitive comparison.
        return df.sort_values(by=td_columns).reset_index(drop=True)

    assert_frame_equal(normalised(fdr.td_df), normalised(exp_target_decoy_df))
Esempio n. 4
0
def test_estimate_fdr_returns_correct_df():
    """``estimate_fdr`` should return one FDR value per target ion, indexed by (sf, adduct).

    Two formulas with two decoy adducts each; with ``fdr_levels`` set to
    [0.2, 0.8] the expected result is 0.2 for H2O and 0.8 for C2H2.
    """
    fdr = FDR(job_id=0, decoy_sample_size=2, target_adducts=['+H'], db=None)
    fdr.fdr_levels = [0.2, 0.8]

    td_rows = [
        ['H2O', '+H', '+Cu'],
        ['H2O', '+H', '+Co'],
        ['C2H2', '+H', '+Ag'],
        ['C2H2', '+H', '+Ar'],
    ]
    fdr.td_df = pd.DataFrame(td_rows, columns=['sf', 'ta', 'da'])

    index_cols = ['sf', 'adduct']
    target_scores = [
        ['H2O', '+H', 0.85],
        ['C2H2', '+H', 0.5],
    ]
    decoy_scores = [
        ['H2O', '+Cu', 0.5],
        ['H2O', '+Co', 0.5],
        ['C2H2', '+Ag', 0.75],
        ['C2H2', '+Ar', 0.0],
    ]
    msm_df = pd.DataFrame(target_scores + decoy_scores,
                          columns=index_cols + ['msm'])
    msm_df = msm_df.set_index(index_cols).sort_index()

    exp_sf_df = pd.DataFrame(
        [['H2O', '+H', 0.2],
         ['C2H2', '+H', 0.8]],
        columns=index_cols + ['fdr']).set_index(index_cols)

    assert_frame_equal(fdr.estimate_fdr(msm_df), exp_sf_df)
Esempio n. 5
0
def test_fdr_decoy_adduct_selection_saves_corr():
    """``decoy_adduct_selection`` should build the expected ``td_df`` before saving it.

    ``_save_target_decoy_df`` is replaced by a mock whose side effect checks
    ``fdr.td_df`` at the moment the save hook is invoked.

    NOTE(review): the expected decoys assume the decoy adduct pool is limited
    to '+He' and '+Li' -- confirm that it is patched where this test is collected.
    """
    db_mock = MagicMock(DB)
    db_mock.select.return_value = [(1, )]

    fdr = FDR(0, 0, 2, ['+H', '+K'], db_mock)

    def assert_df_values_equal(expected, actual):
        # Iterating a DataFrame yields only its column labels, so comparing
        # set(df) with set(other) never looks at the cell values. Compare the
        # rows as tuples instead, ignoring row order.
        columns = list(expected.columns)
        assert set(columns) == set(actual.columns)
        expected_rows = set(expected[columns].itertuples(index=False, name=None))
        actual_rows = set(actual[columns].itertuples(index=False, name=None))
        assert expected_rows == actual_rows

    exp_target_decoy_df = pd.DataFrame([(1, '+H', '+He'), (1, '+H', '+Li'),
                                        (1, '+K', '+He'), (1, '+K', '+Li')],
                                       columns=['sf_id', 'ta', 'da'])
    fdr._save_target_decoy_df = MagicMock(
        side_effect=lambda: assert_df_values_equal(exp_target_decoy_df,
                                                   fdr.td_df))

    fdr.decoy_adduct_selection()

    # The comparison lives in the mock's side effect, so without this check
    # the test would pass vacuously if the save hook were never invoked.
    assert fdr._save_target_decoy_df.called
Esempio n. 6
0
def test_estimate_fdr_digitize_works():
    """``estimate_fdr`` should report one of the configured FDR levels per target ion.

    Four formula ids each get one target ('+H') and one decoy adduct; with
    ``fdr_levels`` set to [0.4, 0.8] the expected FDR values are 0.4 for ids
    1-3 and 0.8 for id 4.
    """
    fdr = FDR(0, 0, 1, ['+H'], None)
    fdr.fdr_levels = [0.4, 0.8]

    index_cols = ['sf_id', 'adduct']
    sf_ids = [1, 2, 3, 4]
    decoy_adducts = ['+Cu', '+Ag', '+Cl', '+Co']

    # One (formula id, target adduct, decoy adduct) row per formula.
    fdr.td_df = pd.DataFrame(
        [[sf_id, '+H', da] for sf_id, da in zip(sf_ids, decoy_adducts)],
        columns=['sf_id', 'ta', 'da'])

    # MSM scores: target ions first, then the matching decoy ions.
    target_msm = [1.0, 0.75, 0.5, 0.25]
    decoy_msm = [0.75, 0.3, 0.25, 0.1]
    msm_rows = [[sf_id, '+H', msm] for sf_id, msm in zip(sf_ids, target_msm)]
    msm_rows += [[sf_id, da, msm]
                 for sf_id, da, msm in zip(sf_ids, decoy_adducts, decoy_msm)]
    msm_df = pd.DataFrame(msm_rows, columns=index_cols + ['msm'])
    msm_df = msm_df.set_index(index_cols).sort_index()

    expected_fdr = [0.4, 0.4, 0.4, 0.8]
    exp_sf_df = pd.DataFrame(
        [[sf_id, '+H', level] for sf_id, level in zip(sf_ids, expected_fdr)],
        columns=index_cols + ['fdr']).set_index(index_cols)

    assert_frame_equal(fdr.estimate_fdr(msm_df), exp_sf_df)
Esempio n. 7
0
def test_estimate_fdr_returns_correct_df():
    """``estimate_fdr`` should return one FDR value per target ion, indexed by (sf_id, adduct).

    Two formula ids with two decoy adducts each; with ``fdr_levels`` set to
    [0.2, 0.8] the expected result is 0.2 for id 1 and 0.8 for id 2.
    """
    fdr = FDR(0, 0, 2, ['+H'], None)
    fdr.fdr_levels = [0.2, 0.8]

    td_rows = [
        [1, '+H', '+Cu'],
        [1, '+H', '+Co'],
        [2, '+H', '+Ag'],
        [2, '+H', '+Ar'],
    ]
    fdr.td_df = pd.DataFrame(td_rows, columns=['sf_id', 'ta', 'da'])

    index_cols = ['sf_id', 'adduct']
    target_scores = [
        [1, '+H', 0.85],
        [2, '+H', 0.5],
    ]
    decoy_scores = [
        [1, '+Cu', 0.5],
        [1, '+Co', 0.5],
        [2, '+Ag', 0.75],
        [2, '+Ar', 0.0],
    ]
    msm_df = pd.DataFrame(target_scores + decoy_scores,
                          columns=index_cols + ['msm'])
    msm_df = msm_df.set_index(index_cols).sort_index()

    exp_sf_df = pd.DataFrame(
        [[1, '+H', 0.2],
         [2, '+H', 0.8]],
        columns=index_cols + ['fdr']).set_index(index_cols)

    assert_frame_equal(fdr.estimate_fdr(msm_df), exp_sf_df)
Esempio n. 8
0
def test_ions():
    """``ion_tuples`` should return a list containing every target ion plus the sampled decoys.

    The exact number of ions is not fixed, so only its bounds are checked.
    """
    sfs = ['H2O', 'C5H2OH']
    target_adducts = ['+H', '+Na']
    decoy_sample_size = 5
    # Every (sf, target adduct) pair, in sf-major order.
    target_ions = list(product(sfs, target_adducts))
    n_targets = len(target_ions)

    fdr = FDR(job_id=0, decoy_sample_size=decoy_sample_size,
              target_adducts=target_adducts, db=None)
    fdr.decoy_adducts_selection(target_ions=target_ions)
    ions = fdr.ion_tuples()

    assert isinstance(ions, list)
    # total number varies because different (sf, adduct) pairs may receive the same (sf, decoy_adduct) pair
    lower_bound = len(sfs) * decoy_sample_size + n_targets  # exclusive
    upper_bound = n_targets * decoy_sample_size + n_targets  # inclusive
    assert lower_bound < len(ions) <= upper_bound
    assert set(target_ions).issubset(set(map(tuple, ions)))
Esempio n. 9
0
    def run(self, input_path, ds_config_path, clean=False):
        """Run the whole molecule search pipeline for one dataset.

        The steps, in order:
         * copy the input data into the engine work dir (optionally cleaning it first)
         * read the dataset config, configure Spark and initialise the database
         * convert the imzML input to the text format unless the text file already exists
         * save dataset and job metadata
         * generate theoretical peaks and select decoy adducts for FDR
         * run the molecule search and store its metrics and isotope images
         * index the dataset with ``ESExporter``

        Any exception is logged with its traceback and not re-raised. The
        Spark context is stopped and the database connection closed afterwards.

        Args:
            input_path (str): path to the dataset folder with .imzML and .ibd files
            ds_config_path (str): path to the dataset config file
            clean (bool): clean all interim data files before starting the search
        """
        try:
            self.wd_manager = WorkDirManager(self.ds_name)
            if clean:
                self.wd_manager.clean()

            self.wd_manager.copy_input_data(input_path, ds_config_path)

            self._read_ds_config()
            logger.info('Dataset config:\n%s', pformat(self.ds_config))

            self._configure_spark()
            self._init_db()

            # Conversion (and the upload that follows it) is skipped when the
            # text file is already present.
            if not self.wd_manager.exists(self.wd_manager.txt_path):
                local_dir = self.wd_manager.local_dir
                converter = ImzmlTxtConverter(self.ds_name,
                                              local_dir.imzml_path,
                                              local_dir.txt_path,
                                              local_dir.coord_path)
                converter.convert()

                if not self.wd_manager.local_fs_only:
                    self.wd_manager.upload_to_remote()

            self.ds = Dataset(self.sc, self.ds_name, self.client_email,
                              input_path, self.ds_config, self.wd_manager,
                              self.db)
            self.ds.save_ds_meta()

            self.store_job_meta()

            peaks_generator = TheorPeaksGenerator(self.sc, self.sm_config,
                                                  self.ds_config)
            peaks_generator.run()

            adducts = self.ds_config['isotope_generation']['adducts']
            self.fdr = FDR(self.job_id,
                           self.sf_db_id,
                           decoy_sample_size=20,
                           target_adducts=adducts,
                           db=self.db)
            self.fdr.decoy_adduct_selection()
            self.formulas = FormulasSegm(self.job_id, self.sf_db_id,
                                         self.ds_config, self.db)

            # NOTE: a commented-out alternative here used MSMBasicSearch with
            # the same arguments.
            searcher = MSMExtraFeats(self.sc, self.ds, self.formulas,
                                     self.fdr, self.ds_config)
            metrics_df, iso_images = searcher.search()

            results = SearchResults(self.sf_db_id, self.ds_id, self.job_id,
                                    self.ds_name,
                                    self.formulas.get_sf_adduct_peaksn(),
                                    self.db, self.sm_config, self.ds_config)
            results.sf_metrics_df = metrics_df
            results.sf_iso_images = iso_images
            results.metrics = searcher.metrics
            results.nrows, results.ncols = self.ds.get_dims()
            results.store()

            exporter = ESExporter(self.sm_config)
            exporter.index_ds(self.db, self.ds_name,
                              self.ds_config['database']['name'])

        except Exception:
            # Log the full traceback; the exception is deliberately not propagated.
            tb_lines = traceback.format_exception(*sys.exc_info())
            logger.error('\n'.join(tb_lines))
        finally:
            if self.sc:
                self.sc.stop()
            if self.db:
                self.db.close()