def migrate_optical_images(ds_id):
    output.print('Migrating optical images')

    with timeit():
        output.print('Transferring images and updating database...')
        db = DB()
        rows = db.select(SEL_OPTICAL_IMGS, params=(ds_id,))
        for opt_image_id, opt_image_url in rows:
            if not opt_image_url and opt_image_id:
                transfer_images(
                    ds_id,
                    'optical_images',
                    image_storage.OPTICAL,
                    [opt_image_id],
                )
                opt_image_url = image_storage.get_image_url(
                    image_storage.OPTICAL, ds_id, opt_image_id
                )
                db.alter(UPD_OPTICAL_IMGS, params=(opt_image_url, opt_image_id))

        opt_thumb_id, opt_thumb_url = db.select_one(SEL_OPT_THUMB, params=(ds_id,))
        if not opt_thumb_url and opt_thumb_id:
            transfer_images(
                ds_id,
                'optical_images',
                image_storage.OPTICAL,
                [opt_thumb_id],
            )
            opt_thumb_url = image_storage.get_image_url(image_storage.OPTICAL, ds_id, opt_thumb_id)
            db.alter(UPD_OPT_THUMB, params=(opt_thumb_url, ds_id))
Example #2
def del_diagnostics(ds_id: str, job_ids: Optional[List[int]] = None):
    db = DB()
    if job_ids is None:
        existing = db.select_with_fields(
            'SELECT id, images FROM dataset_diagnostic dd WHERE dd.ds_id = %s',
            [ds_id],
        )
    else:
        existing = db.select_with_fields(
            'SELECT id, images FROM dataset_diagnostic dd '
            'WHERE dd.ds_id = %s AND dd.job_id = ANY(%s)',
            [ds_id, job_ids],
        )

    if existing:
        # Delete existing images
        image_ids = [
            img['image_id'] for row in existing for img in row['images'] or []
        ]
        image_storage.delete_images(image_storage.DIAG, ds_id, image_ids)

        # Delete existing DB rows
        db.alter(
            'DELETE FROM dataset_diagnostic WHERE id = ANY(%s::uuid[])',
            ([row['id'] for row in existing], ),
        )
Example #3
def create_test_db():
    db_config = dict(database='postgres', user='******', host='localhost')
    db = DB(db_config, autocommit=True)
    db.alter('DROP DATABASE IF EXISTS sm_test')
    db.alter('CREATE DATABASE sm_test')
    db.close()

    local('psql -h localhost -U sm sm_test < {}'.format(join(proj_root(), 'scripts/create_schema.sql')))
Example #4
def fin():
    db_config = dict(database='postgres',
                     user='******',
                     host='localhost',
                     password='******')
    db = DB(db_config, autocommit=True)
    db.alter('DROP DATABASE IF EXISTS sm_test')
    db.close()
Example #5
def fin():
    db = DB(db_config, autocommit=True)
    try:
        db.alter('DROP DATABASE IF EXISTS sm_test')
    except Exception as e:
        logging.getLogger('engine').warning(
            'Drop sm_test database failed: %s', e)
    finally:
        db.close()
Example #6
def create_test_db():
    db_config = dict(database='postgres', user='******', host='localhost')
    db = DB(db_config, autocommit=True)
    db.alter('DROP DATABASE IF EXISTS sm_test')
    db.alter('CREATE DATABASE sm_test')
    db.close()

    local('psql -h localhost -U sm sm_test < {}'.format(
        join(proj_root(), 'scripts/create_schema.sql')))
Example #7
def fill_test_db(create_test_db, drop_test_db):
    db_config = dict(database='sm_test', user='******', host='localhost', password='******')
    db = DB(db_config)
    try:
        db.alter('TRUNCATE dataset CASCADE')
        db.insert("INSERT INTO dataset VALUES (%s, %s, %s, %s, %s)",
                  [(1, 'ds_name', '/ds_path', json.dumps({}), json.dumps({}))])
        db.alter('TRUNCATE coordinates CASCADE')
    except:
        raise
    finally:
        db.close()
Example #8
def fill_test_db(create_test_db, drop_test_db):
    db_config = dict(database='sm_test',
                     user='******',
                     host='localhost',
                     password='******')
    db = DB(db_config)
    try:
        db.alter('TRUNCATE dataset CASCADE')
        db.insert("INSERT INTO dataset VALUES (%s, %s, %s, %s, %s)",
                  [(1, 'ds_name', '/ds_path', json.dumps({}), json.dumps({}))])
        db.alter('TRUNCATE coordinates CASCADE')
    except:
        raise
    finally:
        db.close()
Example #9
def create_fill_test_db(create_test_db, drop_test_db):
    db_config = dict(database='sm_test', user='******', host='localhost', password='******')
    db = DB(db_config)
    try:
        db.alter('TRUNCATE formula_db CASCADE')
        db.insert('INSERT INTO formula_db VALUES (%s, %s, %s)', [(0, '2016-01-01', 'HMDB')])
        db.insert('INSERT INTO formula VALUES (%s, %s, %s, %s, %s)', [(9, 0, '04138', 'Au', 'Gold')])
        db.insert('INSERT INTO agg_formula VALUES (%s, %s, %s, %s, %s)', [(9, 0, 'Au', ['04138'], ['Gold'])])
        db.alter('TRUNCATE theor_peaks CASCADE')
        db.insert('INSERT INTO theor_peaks VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)',
                  [(0, 9, '+H', 0.01, 1, 10000, [100, 200], [10, 1], [], [])])
    except:
        raise
    finally:
        db.close()
Example #10
def migrate_ion_thumbnail(ds_id):
    output.print('Migrating ion thumbnail images')

    with timeit():
        output.print('Transferring images and updating database...')
        db = DB()
        ion_thumb_id, ion_thumb_url = db.select_one(SEL_ION_THUMB, params=(ds_id,))
        if not ion_thumb_url and ion_thumb_id:
            transfer_images(
                ds_id,
                'ion_thumbnails',
                image_storage.THUMB,
                [ion_thumb_id],
            )
            ion_thumb_url = image_storage.get_image_url(image_storage.THUMB, ds_id, ion_thumb_id)
            db.alter(UPD_ION_THUMB, params=(ion_thumb_url, ds_id))
Example #11
def update_core_metabolome_database():
    db = DB()
    rows = db.select_with_fields(
        "SELECT * FROM molecular_db WHERE name = 'core_metabolome_v3'")
    if rows:
        moldb = rows[0]

        logger.info(f'Updating molecular database: {moldb}')

        moldb['name'] = 'CoreMetabolome'
        moldb['version'] = 'v3'
        moldb['full_name'] = 'Core Metabolome Database'
        moldb[
            'description'] = 'METASPACE database of core mammalian metabolites and lipids'
        moldb['link'] = 'https://metaspace2020.eu'
        moldb['citation'] = ttdoc(tttext('In preparation'))
        moldb['group_id'] = None
        moldb['is_public'] = True

        db.alter(
            ("UPDATE molecular_db "
             "SET name = %s, version = %s, full_name = %s, description = %s,"
             "    link = %s, citation = %s, group_id = %s, is_public = %s "
             "WHERE id = %s;"),
            params=(
                moldb['name'],
                moldb['version'],
                moldb['full_name'],
                moldb['description'],
                moldb['link'],
                moldb['citation'],
                moldb['group_id'],
                moldb['is_public'],
                moldb['id'],
            ),
        )

    rows = db.select_with_fields(
        "SELECT * FROM molecular_db WHERE name = 'CoreMetabolome'")
    if rows:
        logger.info(f'Updated database: {rows[0]}')
    else:
        logger.error(f'Did not find database "CoreMetabolome"')
Example #12
def del_jobs(ds: Dataset, moldb_ids: Optional[Iterable[int]] = None):
    """
    Delete a dataset's jobs for the specified moldbs, or all jobs if moldb_ids is None.
    Also cleans up the annotations from ElasticSearch and deletes their ion images.
    """
    db = DB()
    es = ESExporter(db)

    if moldb_ids is None:
        moldb_ids = get_ds_moldb_ids(ds.id)
    moldbs = molecular_db.find_by_ids(moldb_ids)

    job_ids = db.select_onecol(
        'SELECT j.id FROM job j WHERE ds_id = %s AND moldb_id = ANY(%s)',
        (ds.id, list(moldb_ids)))
    del_diagnostics(ds.id, job_ids)

    for moldb in moldbs:
        logger.info(
            f'Deleting isotopic images: ds_id={ds.id} ds_name={ds.name} moldb={moldb}'
        )
        img_id_rows = db.select_onecol(
            'SELECT iso_image_ids '
            'FROM annotation m '
            'JOIN job j ON j.id = m.job_id '
            'JOIN dataset d ON d.id = j.ds_id '
            'WHERE ds_id = %s AND j.moldb_id = %s',
            (ds.id, moldb.id),
        )

        image_ids = [
            img_id for img_ids in img_id_rows for img_id in img_ids
            if img_id is not None
        ]
        image_storage.delete_images(image_storage.ISO, ds.id, image_ids)

        logger.info(
            f"Deleting job results: ds_id={ds.id} ds_name={ds.name} moldb={moldb}"
        )
        db.alter('DELETE FROM job WHERE ds_id = %s and moldb_id = %s',
                 (ds.id, moldb.id))
        es.delete_ds(ds.id, moldb)
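A quick usage sketch (not part of the snippets above): ds is assumed to be an already-loaded sm.engine Dataset, with a ConnectionPool open as in the other examples on this page.

del_jobs(ds, moldb_ids=[22, 23])  # delete only the jobs for these two molecular databases
del_jobs(ds)                      # moldb_ids=None: delete every job of the dataset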
Example #13
def update_public_database_descriptions():
    db = DB()
    public_db_names = db.select(
        'SELECT name FROM molecular_db WHERE is_public = true AND archived = false'
    )
    logger.info(f'Updating public molecular databases: {public_db_names}')

    for (name, ) in public_db_names:
        desc = database_descriptions.get(name, None)
        if desc:
            db.alter(
                "UPDATE molecular_db "
                "SET description = %s, full_name = %s, link = %s, citation = %s "
                "WHERE name = %s;",
                params=(
                    desc['description'],
                    desc['full_name'],
                    desc['link'],
                    desc['citation'],
                    name,
                ),
            )
Example #14
def ensure_db_populated(sm_config, analysis_version, database):
    db = DB()
    # Install DB schema if needed
    query = "SELECT COUNT(*) FROM pg_tables WHERE schemaname = 'public' AND tablename = 'dataset'"
    tables_exist = db.select_one(query)[0] >= 1
    if not tables_exist:
        print('Installing DB schema')
        db.alter(DB_SQL_SCHEMA)

    # Import HMDB if needed
    moldb = MOL_DBS[database]
    try:
        molecular_db.find_by_name_version(moldb['name'], moldb['version'])
    except SMError:
        print(f'Importing {database}')
        with TemporaryDirectory() as tmp:
            urlretrieve(moldb['url'], f'{tmp}/moldb.tsv')
            molecular_db.create(moldb['name'], moldb['version'],
                                f'{tmp}/moldb.tsv')

    if analysis_version > 1:
        if len(
                db.select(
                    "SELECT name FROM scoring_model WHERE name = 'v3_default'")
        ) == 0:
            print("Importing v3_default scoring model")
            params = upload_catboost_scoring_model(
                model=Path(proj_root()) /
                '../scoring-models/v3_default/model-2022-01-05T13-45-26.947188-416b1311.cbm',
                bucket=sm_config['lithops']['lithops']['storage_bucket'],
                prefix=f'test_scoring_models/v3_default',
                is_public=False,
            )
            save_scoring_model_to_db(name='v3_default',
                                     type_='catboost',
                                     params=params)
Example #15
def test_db(sm_config, request):
    db_config = dict(**sm_config['db'])
    db_config['database'] = 'postgres'

    db = DB(db_config, autocommit=True)
    db.alter('DROP DATABASE IF EXISTS sm_test')
    db.alter('CREATE DATABASE sm_test')
    db.close()

    local('psql -h {} -U {} sm_test < {}'.format(
        sm_config['db']['host'], sm_config['db']['user'],
        Path(proj_root()) / 'scripts/create_schema.sql'))

    def fin():
        db = DB(db_config, autocommit=True)
        try:
            db.alter('DROP DATABASE IF EXISTS sm_test')
        except Exception as e:
            logging.getLogger('engine').warning(
                'Drop sm_test database failed: %s', e)
        finally:
            db.close()

    request.addfinalizer(fin)
Example #16
class SearchJob(object):
    """ Main class responsible for molecule search. Uses other modules of the engine.

    Args
    ----
    no_clean : bool
        Don't delete interim data files
    """
    def __init__(self, img_store=None, no_clean=False):
        self.no_clean = no_clean
        self._img_store = img_store

        self._job_id = None
        self._sc = None
        self._db = None
        self._ds = None
        self._ds_reader = None
        self._status_queue = None
        self._fdr = None
        self._wd_manager = None
        self._es = None

        self._sm_config = SMConfig.get_conf()

        logger.debug('Using SM config:\n%s', pformat(self._sm_config))

    def _configure_spark(self):
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self._sm_config['spark'].items():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        if 'aws' in self._sm_config:
            sconf.set("spark.hadoop.fs.s3a.access.key", self._sm_config['aws']['aws_access_key_id'])
            sconf.set("spark.hadoop.fs.s3a.secret.key", self._sm_config['aws']['aws_secret_access_key'])
            sconf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            sconf.set("spark.hadoop.fs.s3a.endpoint", "s3.{}.amazonaws.com".format(self._sm_config['aws']['aws_region']))

        self._sc = SparkContext(master=self._sm_config['spark']['master'], conf=sconf, appName='SM engine')

    def _init_db(self):
        logger.info('Connecting to the DB')
        self._db = DB(self._sm_config['db'])

    def store_job_meta(self, mol_db_id):
        """ Store search job metadata in the database """
        logger.info('Storing job metadata')
        rows = [(mol_db_id, self._ds.id, 'STARTED', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))]
        self._job_id = self._db.insert_return(JOB_INS, rows=rows)[0]

    def _run_annotation_job(self, mol_db):
        try:
            self.store_job_meta(mol_db.id)
            mol_db.set_job_id(self._job_id)

            logger.info("Running new job ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                        self._ds.id, self._ds.name, mol_db.name, mol_db.version)

            target_adducts = self._ds.config['isotope_generation']['adducts']
            self._fdr = FDR(job_id=self._job_id,
                            decoy_sample_size=20,
                            target_adducts=target_adducts,
                            db=self._db)

            isocalc = IsocalcWrapper(self._ds.config['isotope_generation'])
            centroids_gen = IonCentroidsGenerator(sc=self._sc, moldb_name=mol_db.name, isocalc=isocalc)
            polarity = self._ds.config['isotope_generation']['charge']['polarity']
            all_adducts = list(set(self._sm_config['defaults']['adducts'][polarity]) | set(DECOY_ADDUCTS))
            centroids_gen.generate_if_not_exist(isocalc=isocalc,
                                                sfs=mol_db.sfs,
                                                adducts=all_adducts)
            target_ions = centroids_gen.ions(target_adducts)
            self._fdr.decoy_adducts_selection(target_ions)

            search_alg = MSMBasicSearch(sc=self._sc, ds=self._ds, ds_reader=self._ds_reader,
                                        mol_db=mol_db, centr_gen=centroids_gen,
                                        fdr=self._fdr, ds_config=self._ds.config)
            ion_metrics_df, ion_iso_images = search_alg.search()

            search_results = SearchResults(mol_db.id, self._job_id, search_alg.metrics.keys())
            mask = self._ds_reader.get_2d_sample_area_mask()
            img_store_type = self._ds.get_ion_img_storage_type(self._db)
            search_results.store(ion_metrics_df, ion_iso_images, mask, self._db, self._img_store, img_store_type)
        except Exception as e:
            self._db.alter(JOB_UPD, params=('FAILED', datetime.now().strftime('%Y-%m-%d %H:%M:%S'), self._job_id))
            msg = 'Job failed(ds_id={}, mol_db={}): {}'.format(self._ds.id, mol_db, str(e))
            raise JobFailedError(msg) from e
        else:
            self._export_search_results_to_es(mol_db, isocalc)

    def _export_search_results_to_es(self, mol_db, isocalc):
        try:
            self._es.index_ds(self._ds.id, mol_db, isocalc)
        except Exception as e:
            self._db.alter(JOB_UPD, params=('FAILED', datetime.now().strftime('%Y-%m-%d %H:%M:%S'), self._job_id))
            msg = 'Export to ES failed(ds_id={}, mol_db={}): {}'.format(self._ds.id, mol_db, str(e))
            raise ESExportFailedError(msg) from e
        else:
            self._db.alter(JOB_UPD, params=('FINISHED', datetime.now().strftime('%Y-%m-%d %H:%M:%S'), self._job_id))

    def _remove_annotation_job(self, mol_db):
        logger.info("Removing job results ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                    self._ds.id, self._ds.name, mol_db.name, mol_db.version)
        self._db.alter('DELETE FROM job WHERE ds_id = %s and db_id = %s', params=(self._ds.id, mol_db.id))
        self._es.delete_ds(self._ds.id, mol_db)

    def _moldb_ids(self):
        moldb_service = MolDBServiceWrapper(self._sm_config['services']['mol_db'])
        completed_moldb_ids = {moldb_service.find_db_by_id(db_id)['id']
                               for (_, db_id) in self._db.select(JOB_ID_MOLDB_ID_SEL, params=(self._ds.id,))}
        new_moldb_ids = {moldb_service.find_db_by_name_version(moldb_name)[0]['id']
                         for moldb_name in self._ds.config['databases']}
        return completed_moldb_ids, new_moldb_ids

    def _save_data_from_raw_ms_file(self):
        ms_file_type_config = SMConfig.get_ms_file_handler(self._wd_manager.local_dir.ms_file_path)
        acq_geometry_factory_module = ms_file_type_config['acq_geometry_factory']
        acq_geometry_factory = getattr(import_module(acq_geometry_factory_module['path']),
                                                acq_geometry_factory_module['name'])

        acq_geometry = acq_geometry_factory(self._wd_manager.local_dir.ms_file_path).create()
        self._ds.save_acq_geometry(self._db, acq_geometry)

        self._ds.save_ion_img_storage_type(self._db, ms_file_type_config['img_storage_type'])

    def run(self, ds):
        """ Entry point of the engine. Molecule search is completed in several steps:
            * Copying input data to the engine work dir
            * Conversion input mass spec files to plain text format. One line - one spectrum data
            * Generation and saving to the database theoretical peaks for all formulas from the molecule database
            * Molecules search. The most compute intensive part. Spark is used to run it in distributed manner.
            * Saving results (isotope images and their metrics of quality for each putative molecule) to the database

        Args
        ----
            ds : sm.engine.dataset_manager.Dataset
        """
        try:
            start = time.time()

            self._init_db()
            self._es = ESExporter(self._db)
            self._ds = ds

            if self._sm_config['rabbitmq']:
                self._status_queue = QueuePublisher(config=self._sm_config['rabbitmq'],
                                                    qdesc=SM_DS_STATUS,
                                                    logger=logger)
            else:
                self._status_queue = None
            ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.STARTED)

            self._wd_manager = WorkDirManager(ds.id)
            self._configure_spark()

            if not self.no_clean:
                self._wd_manager.clean()

            self._ds_reader = DatasetReader(self._ds.input_path, self._sc, self._wd_manager)
            self._ds_reader.copy_convert_input_data()

            self._save_data_from_raw_ms_file()
            self._img_store.storage_type = self._ds.get_ion_img_storage_type(self._db)

            ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.STARTED)

            logger.info('Dataset config:\n%s', pformat(self._ds.config))

            completed_moldb_ids, new_moldb_ids = self._moldb_ids()
            for moldb_id in completed_moldb_ids.symmetric_difference(new_moldb_ids):  # ignore ids present in both sets
                mol_db = MolecularDB(id=moldb_id, db=self._db,
                                     iso_gen_config=self._ds.config['isotope_generation'])
                if moldb_id not in new_moldb_ids:
                    self._remove_annotation_job(mol_db)
                elif moldb_id not in completed_moldb_ids:
                    self._run_annotation_job(mol_db)

            ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.FINISHED)

            logger.info("All done!")
            time_spent = time.time() - start
            logger.info('Time spent: %d mins %d secs', *divmod(int(round(time_spent)), 60))
        except Exception as e:
            if self._ds:
                ds.set_status(self._db, self._es, self._status_queue, DatasetStatus.FAILED)
            logger.error(e, exc_info=True)
            raise
        finally:
            if self._sc:
                self._sc.stop()
            if self._db:
                self._db.close()
            if self._wd_manager and not self.no_clean:
                self._wd_manager.clean()
            logger.info('*' * 150)
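For context, a rough sketch of how this SearchJob is typically driven; the image-store client, config loading and dataset object are assumptions, not shown in the snippet above.

img_store = ...  # the engine's image-store client, constructed elsewhere (assumed)
ds = ...         # a loaded sm.engine.dataset_manager.Dataset (assumed)
job = SearchJob(img_store=img_store, no_clean=True)  # no_clean=True keeps interim files
job.run(ds)      # runs/removes annotation jobs based on ds.config['databases']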
Example #17
class SciTester:
    def __init__(self, sm_config, analysis_version, database):
        reports_path = Path(proj_root()) / 'tests/reports'
        timestamp = datetime.now().replace(microsecond=0).isoformat().replace(
            ':', '-')
        suffix = f'{database}-v{analysis_version}'

        self.sm_config = sm_config
        self.db = DB()

        self.ds_id = '2000-01-01_00h00m01s'
        self.ref_results_path = reports_path / f'spheroid-{suffix}.csv'
        self.output_results_path = reports_path / f'test-{suffix}-{timestamp}.csv'

        self.ds_name = 'sci_test_spheroid_untreated'
        self.ds_data_path = join(self.sm_config['fs']['spark_data_path'],
                                 self.ds_name)
        self.moldb = MOL_DBS[database]
        self.analysis_version = analysis_version
        self.input_path = join(proj_root(), 'tests/data/untreated')
        self.ds_config_path = join(self.input_path, 'config.json')
        self.metrics = [
            'chaos', 'spatial', 'spectral', 'mz_err_abs', 'mz_err_rel', 'msm',
            'fdr'
        ]

        self.comparison_df = None

    def fetch_search_res_df(self):
        query = ("SELECT m.formula, m.adduct, m.msm, m.fdr, m.stats "
                 "FROM annotation m "
                 "JOIN job j ON j.id = m.job_id "
                 "WHERE j.ds_id = %s "
                 "ORDER BY formula, adduct ")

        rows = self.db.select_with_fields(query, params=(self.ds_id, ))
        return pd.DataFrame([{
            'formula': r['formula'],
            'adduct': r['adduct'],
            'msm': r['msm'],
            'fdr': r['fdr'],
            **r['stats'],
        } for r in rows])

    def save_reference_results(self):
        results_df = self.fetch_search_res_df()

        cols = ['formula', 'adduct', *self.metrics]
        results_df[cols].to_csv(self.ref_results_path, index=False)

        print(
            f'Successfully saved reference search results to {self.ref_results_path}'
        )

    def save_comparison_results(self):
        self.comparison_df.to_csv(self.output_results_path, index=False)

    @staticmethod
    def print_metric_hist(metric_vals):
        if 0.2 < np.max(metric_vals) - np.min(metric_vals) <= 3.0:
            # For metrics in the range -1.0 to 1.0, aligned bins of 0.1 are easier to read
            min_edge = np.floor(np.min(metric_vals) * 10) / 10
            max_edge = np.ceil(np.max(metric_vals) * 10) / 10
            n_bins = int(np.round((max_edge - min_edge) * 10))
        else:
            # Otherwise use unaligned bins
            min_edge = np.min(metric_vals)
            max_edge = np.max(metric_vals)
            n_bins = 10
        bins = np.linspace(min_edge, max_edge, n_bins + 1)
        metric_freq, metric_interv = np.histogram(metric_vals, bins=bins)

        for lo, hi, freq in zip(metric_interv[:-1], metric_interv[1:],
                                metric_freq):
            print(f'{lo:f}-{hi:f}: {freq}')

    def print_differences(self):
        df = self.comparison_df
        missing_df = df[df.matching == 'ref_only']
        unexpected_df = df[df.matching == 'new_only']
        common_df = df[df.matching == '']
        n_ref = df.matching.isin({'ref_only', ''}).sum()  # annotations in the reference results
        n_new = df.matching.isin({'new_only', ''}).sum()  # annotations in the new results

        print(
            f'MISSED FORMULAS: {len(missing_df)} ({len(missing_df) * 100 / n_ref:.1f}%)'
        )
        print(
            f'FALSE DISCOVERY: {len(unexpected_df)} ({len(unexpected_df) * 100 / n_new:.1f}%)'
        )

        differing_metrics = [
            metric for metric in self.metrics
            if common_df[f'{metric}_differs'].any()
        ]
        if differing_metrics:
            for metric in differing_metrics:
                print(f'{metric}_new - {metric}_ref histogram: ')
                self.print_metric_hist(common_df[f'{metric}_new'] -
                                       common_df[f'{metric}_ref'])
                print()
        else:
            print('All metrics equal in common annotations')

    def fdr_differs(self, fdr_ref, fdr_new):
        if self.analysis_version == 1:
            # FDRs are quantized - allow them to jump up/down one level
            # (see the worked example after this class)
            levels = [0.0501, 0.1001, 0.2001, 0.5001]
            ref_level = next(
                (i for i, level in enumerate(levels) if fdr_ref < level),
                len(levels))
            new_level = next(
                (i for i, level in enumerate(levels) if fdr_new < level),
                len(levels))
            return abs(ref_level - new_level) > 1
        else:
            # Allow +/- 10% relative difference OR +/- 5% FDR absolute difference to compensate for
            # possible differences if the decoys are sampled differently.
            return not np.isclose(fdr_ref, fdr_new, rtol=0.1, atol=0.05)

    def make_comparison_df(self):
        ref_results = pd.read_csv(self.ref_results_path)
        new_results = self.fetch_search_res_df()

        df = ref_results.merge(
            new_results,
            on=['formula', 'adduct'],
            how='outer',
            suffixes=('_ref', '_new'),
            indicator='matching',
        )
        df['matching'] = df.matching.cat.rename_categories({
            'left_only': 'ref_only',
            'right_only': 'new_only',
            'both': ''
        })

        # Interleave columns for easy side-by-side comparison
        cols = ['formula', 'adduct', 'matching']
        for col in self.metrics:
            cols.append(f'{col}_ref')
            cols.append(f'{col}_new')
        df = df[cols]

        # Add "differs" fields indicating whether the values have changed enough to be considered
        # different from the originals.
        for col in self.metrics:
            if col == 'fdr':
                df[f'fdr_differs'] = [
                    self.fdr_differs(row.fdr_ref, row.fdr_new)
                    for row in df[['fdr_ref', 'fdr_new']].itertuples()
                ]
            else:
                df[f'{col}_differs'] = ~np.isclose(df[f'{col}_ref'],
                                                   df[f'{col}_new'])

        self.comparison_df = df

    def search_results_are_different(self):
        annotations_mismatch = (self.comparison_df.matching != '').any()
        metrics_differ = any(self.comparison_df[f'{metric}_differs'].any()
                             for metric in self.metrics)
        return annotations_mismatch or metrics_differ

    @classmethod
    def _patch_image_storage(cls):
        class ImageStorageMock:
            ISO = image_storage.ISO

            def __init__(self, *args, **kwargs):
                pass

            def post_image(self, *args, **kwargs):
                pass

        from sm.engine.annotation_spark import search_results

        search_results.ImageStorage = ImageStorageMock

    def run_search(self, store_images=False, use_lithops=False):
        if not store_images:
            self._patch_image_storage()

        moldb_id = molecular_db.find_by_name_version(self.moldb['name'],
                                                     self.moldb['version']).id

        os.environ['PYSPARK_PYTHON'] = sys.executable

        ds = create_ds_from_files(self.ds_id, self.ds_name, self.input_path)
        ds.config['analysis_version'] = self.analysis_version
        ds.config['fdr'][
            'scoring_model'] = 'v3_default' if self.analysis_version > 1 else None
        ds.config['database_ids'] = [moldb_id]

        self.db.alter('DELETE FROM job WHERE ds_id=%s', params=(ds.id, ))
        ds.save(self.db, allow_insert=True)
        perf = NullProfiler()
        if use_lithops:
            # Override the runtime to force it to run without docker.
            lithops_executor.RUNTIME_CF_VPC = 'python'
            lithops_executor.RUNTIME_CE = 'python'

            executor = Executor(self.sm_config['lithops'], perf)
            job = ServerAnnotationJob(
                executor,
                ds,
                perf,
                self.sm_config,
                store_images=store_images,
            )
            job.run(debug_validate=True)
        else:
            AnnotationJob(ds, perf).run()

        self.make_comparison_df()

    def clear_data_dirs(self):
        path = Path(self.ds_data_path)
        if path.exists():
            path.rmdir()
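A worked illustration of the fdr_differs rules above; the setup values and the 'hmdb' key are hypothetical.

tester = SciTester(sm_config, analysis_version=1, database='hmdb')  # assumed setup
tester.fdr_differs(0.05, 0.10)   # False: 0.05 and 0.10 fall in adjacent quantized levels
tester.fdr_differs(0.05, 0.50)   # True: a jump of more than one quantized level
tester = SciTester(sm_config, analysis_version=3, database='hmdb')
tester.fdr_differs(0.10, 0.12)   # False: within the 10% relative / 5% absolute tolerance
tester.fdr_differs(0.10, 0.20)   # True: outside both tolerances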
Example #18
def fin():
    db_config = dict(database='postgres', user='******', host='localhost', password='******')
    db = DB(db_config, autocommit=True)
    db.alter('DROP DATABASE IF EXISTS sm_test')
    db.close()
Example #19
        # NOTE: this snippet is truncated at the top. The two lines below are a reconstruction
        # (mirroring the ARCHIVED_MOLDB_UPD block further down), not text from the source.
        PUBLIC_MOLDB_UPD = 'UPDATE molecular_db SET is_public = true WHERE name IN %s'
        PUBLIC_DATABASE_NAMES = (
            'BraChemDB-2018-01',
            'ChEBI-2018-01',
            'ECMDB-2018-12',
            'HMDB-v4',
            'HMDB-v4-cotton',
            'HMDB-v4-endogenous',
            'LipidMaps-2017-12-12',
            'PAMDB-v1.0',
            'SwissLipids-2018-02-02',
            'HMDB',
            'ChEBI',
            'LIPID_MAPS',
            'SwissLipids',
            'COTTON_HMDB',
            'HMDB-v2.5',
            'HMDB-v2.5-cotton',
        )
        db.alter(PUBLIC_MOLDB_UPD, params=(PUBLIC_DATABASE_NAMES, ))

        ARCHIVED_MOLDB_UPD = 'UPDATE molecular_db SET archived = true WHERE name IN %s'
        ARCHIVED_DATABASE_NAMES = (
            'HMDB',
            'ChEBI',
            'LIPID_MAPS',
            'SwissLipids',
            'COTTON_HMDB',
            'HMDB-v2.5',
            'HMDB-v2.5-cotton',
        )
        db.alter(ARCHIVED_MOLDB_UPD, params=(ARCHIVED_DATABASE_NAMES, ))
Example #20
import argparse
import logging

import pandas as pd

from sm.engine.db import DB, ConnectionPool
from sm.engine.config import init_loggers

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Update molecular database molecule names')
    parser.add_argument('--config',
                        default='conf/config.json',
                        help='SM config path')
    parser.add_argument('file_path', help='Path to file with new names')
    args = parser.parse_args()
    init_loggers()
    logger = logging.getLogger('engine')

    logger.info(f'Importing new names from {args.file_path}')

    db_config = {"host": "localhost", "database": "mol_db", "user": "******"}
    with ConnectionPool(db_config):
        db = DB()
        names_df = pd.read_csv(args.file_path, sep='\t')[['id', 'name']]

        sql = (
            'WITH molecule_name AS (SELECT UNNEST(%s::text[]) as id_, UNNEST(%s::text[]) as name_) '
            'UPDATE molecule SET mol_name = molecule_name.name_ '
            'FROM molecule_name WHERE molecule.mol_id = molecule_name.id_')
        db.alter(sql,
                 [names_df.id.values.tolist(),
                  names_df.name.values.tolist()])
Example #21
def add_diagnostics(diagnostics: List[DatasetDiagnostic]):
    """Upserts dataset diagnostics, overwriting existing values with the same ds_id, job_id, type"""
    # Validate input, as postgres can't enforce the JSON columns have the correct schema,
    # and many places (graphql, python client, etc.) rely on these structures.

    if not diagnostics:
        return

    for diagnostic in diagnostics:
        assert 'ds_id' in diagnostic
        assert 'type' in diagnostic
        images = diagnostic.get('images', [])
        assert all(image['key'] in DiagnosticImageKey for image in images)
        assert all(image['format'] in DiagnosticImageFormat
                   for image in images)
        assert all(image['image_id'] in image['url'] for image in images)
        image_keys = set(
            (image.get('key'), image.get('index')) for image in images)
        assert len(image_keys) == len(
            images), 'diagnostic image keys should be unique'

    db = DB()
    # Find all diagnostics that should be replaced by the new diagnostics
    existing = db.select_with_fields(
        """
        WITH new_diagnostic AS (
            SELECT UNNEST(%s::text[]) as ds_id, UNNEST(%s::int[]) as job_id,
            UNNEST(%s::text[]) as type
        )
        SELECT dd.ds_id, dd.id, dd.images
        FROM new_diagnostic nd
        JOIN dataset_diagnostic dd ON nd.ds_id = dd.ds_id
            AND (nd.job_id = dd.job_id OR (nd.job_id IS NULL AND dd.job_id is NULL))
            AND nd.type = dd.type
        """,
        list(
            map(
                list,
                zip(*((d['ds_id'], d.get('job_id'), d['type'])
                      for d in diagnostics)))),
    )

    if existing:
        logger.debug(
            f'Deleting {len(existing)} existing diagnostics for dataset {existing[0]["ds_id"]}'
        )
        # Delete existing images
        image_ids_by_ds = defaultdict(list)
        for row in existing:
            for img in row['images'] or []:
                image_ids_by_ds[row['ds_id']].append(img['image_id'])
        for ds_id, image_ids in image_ids_by_ds.items():
            image_storage.delete_images(image_storage.DIAG, ds_id, image_ids)

        # Delete existing DB rows
        db.alter(
            'DELETE FROM dataset_diagnostic WHERE id = ANY(%s::uuid[])',
            ([row['id'] for row in existing], ),
        )

    logger.debug(
        f'Inserting {len(diagnostics)} diagnostics for dataset {diagnostics[0]["ds_id"]}'
    )
    db.insert(
        'INSERT INTO dataset_diagnostic (ds_id, job_id, type, updated_dt, data, error, images) '
        'VALUES (%s, %s, %s, %s, %s, %s, %s)',
        [(
            d['ds_id'],
            d.get('job_id'),
            d['type'],
            datetime.now(),
            numpy_json_dumps(d['data']) if d.get('data') is not None else None,
            d.get('error'),
            numpy_json_dumps(d.get('images', [])),
        ) for d in diagnostics],
    )
Example #22
class SearchJob(object):
    """ Main class responsible for molecule search. Uses other modules of the engine.

    Args
    ----
    no_clean : bool
        Don't delete interim data files
    """
    def __init__(self, img_store=None, no_clean=False):
        self.no_clean = no_clean
        self._img_store = img_store

        self._job_id = None
        self._sc = None
        self._db = None
        self._ds = None
        self._ds_reader = None
        self._status_queue = None
        self._fdr = None
        self._wd_manager = None
        self._es = None

        self._sm_config = SMConfig.get_conf()

        logger.debug('Using SM config:\n%s', pformat(self._sm_config))

    def _configure_spark(self):
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self._sm_config['spark'].items():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        if 'aws' in self._sm_config:
            sconf.set("spark.hadoop.fs.s3a.access.key",
                      self._sm_config['aws']['aws_access_key_id'])
            sconf.set("spark.hadoop.fs.s3a.secret.key",
                      self._sm_config['aws']['aws_secret_access_key'])
            sconf.set("spark.hadoop.fs.s3a.impl",
                      "org.apache.hadoop.fs.s3a.S3AFileSystem")
            sconf.set(
                "spark.hadoop.fs.s3a.endpoint", "s3.{}.amazonaws.com".format(
                    self._sm_config['aws']['aws_region']))

        self._sc = SparkContext(master=self._sm_config['spark']['master'],
                                conf=sconf,
                                appName='SM engine')

    def _init_db(self):
        logger.info('Connecting to the DB')
        self._db = DB(self._sm_config['db'])

    def store_job_meta(self, mol_db_id):
        """ Store search job metadata in the database """
        logger.info('Storing job metadata')
        rows = [(mol_db_id, self._ds.id, JobStatus.RUNNING,
                 datetime.now().strftime('%Y-%m-%d %H:%M:%S'))]
        self._job_id = self._db.insert_return(JOB_INS, rows=rows)[0]

    def _run_annotation_job(self, mol_db):
        try:
            self.store_job_meta(mol_db.id)
            mol_db.set_job_id(self._job_id)

            logger.info(
                "Running new job ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
                self._ds.id, self._ds.name, mol_db.name, mol_db.version)

            target_adducts = self._ds.config['isotope_generation']['adducts']
            self._fdr = FDR(job_id=self._job_id,
                            decoy_sample_size=20,
                            target_adducts=target_adducts,
                            db=self._db)

            isocalc = IsocalcWrapper(self._ds.config['isotope_generation'])
            centroids_gen = IonCentroidsGenerator(sc=self._sc,
                                                  moldb_name=mol_db.name,
                                                  isocalc=isocalc)
            polarity = self._ds.config['isotope_generation']['charge'][
                'polarity']
            all_adducts = list(
                set(self._sm_config['defaults']['adducts'][polarity])
                | set(DECOY_ADDUCTS))
            centroids_gen.generate_if_not_exist(isocalc=isocalc,
                                                sfs=mol_db.sfs,
                                                adducts=all_adducts)
            target_ions = centroids_gen.ions(target_adducts)
            self._fdr.decoy_adducts_selection(target_ions)

            search_alg = MSMBasicSearch(sc=self._sc,
                                        ds=self._ds,
                                        ds_reader=self._ds_reader,
                                        mol_db=mol_db,
                                        centr_gen=centroids_gen,
                                        fdr=self._fdr,
                                        ds_config=self._ds.config)
            ion_metrics_df, ion_iso_images = search_alg.search()

            search_results = SearchResults(mol_db.id, self._job_id,
                                           search_alg.metrics.keys())
            mask = self._ds_reader.get_2d_sample_area_mask()
            img_store_type = self._ds.get_ion_img_storage_type(self._db)
            search_results.store(ion_metrics_df, ion_iso_images, mask,
                                 self._db, self._img_store, img_store_type)
        except Exception as e:
            self._db.alter(
                JOB_UPD_STATUS_FINISH,
                params=(JobStatus.FAILED,
                        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                        self._job_id))
            msg = 'Job failed(ds_id={}, mol_db={}): {}'.format(
                self._ds.id, mol_db, str(e))
            raise JobFailedError(msg) from e
        else:
            self._db.alter(
                JOB_UPD_STATUS_FINISH,
                params=(JobStatus.FINISHED,
                        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                        self._job_id))

    def _remove_annotation_job(self, mol_db):
        logger.info(
            "Removing job results ds_id: %s, ds_name: %s, db_name: %s, db_version: %s",
            self._ds.id, self._ds.name, mol_db.name, mol_db.version)
        self._db.alter('DELETE FROM job WHERE ds_id = %s and db_id = %s',
                       params=(self._ds.id, mol_db.id))
        self._es.delete_ds(self._ds.id, mol_db)

    def _moldb_ids(self):
        completed_moldb_ids = {
            db_id
            for (_, db_id) in self._db.select(JOB_ID_MOLDB_ID_SEL,
                                              params=(self._ds.id, ))
        }
        new_moldb_ids = {
            MolecularDB(name=moldb_name).id
            for moldb_name in self._ds.config['databases']
        }
        return completed_moldb_ids, new_moldb_ids

    def _save_data_from_raw_ms_file(self):
        ms_file_type_config = SMConfig.get_ms_file_handler(
            self._wd_manager.local_dir.ms_file_path)
        acq_geometry_factory_module = ms_file_type_config[
            'acq_geometry_factory']
        acq_geometry_factory = getattr(
            import_module(acq_geometry_factory_module['path']),
            acq_geometry_factory_module['name'])

        acq_geometry = acq_geometry_factory(
            self._wd_manager.local_dir.ms_file_path).create()
        self._ds.save_acq_geometry(self._db, acq_geometry)

        self._ds.save_ion_img_storage_type(
            self._db, ms_file_type_config['img_storage_type'])

    def run(self, ds):
        """ Entry point of the engine. Molecule search is completed in several steps:
            * Copying input data to the engine work dir
            * Conversion input mass spec files to plain text format. One line - one spectrum data
            * Generation and saving to the database theoretical peaks for all formulas from the molecule database
            * Molecules search. The most compute intensive part. Spark is used to run it in distributed manner.
            * Saving results (isotope images and their metrics of quality for each putative molecule) to the database

        Args
        ----
            ds : sm.engine.dataset_manager.Dataset
        """
        try:
            logger.info('*' * 150)
            start = time.time()

            self._init_db()
            self._es = ESExporter(self._db)
            self._ds = ds

            if self._sm_config['rabbitmq']:
                self._status_queue = QueuePublisher(
                    config=self._sm_config['rabbitmq'],
                    qdesc=SM_DS_STATUS,
                    logger=logger)
            else:
                self._status_queue = None

            self._wd_manager = WorkDirManager(ds.id)
            self._configure_spark()

            if not self.no_clean:
                self._wd_manager.clean()

            self._ds_reader = DatasetReader(self._ds.input_path, self._sc,
                                            self._wd_manager)
            self._ds_reader.copy_convert_input_data()

            self._save_data_from_raw_ms_file()
            self._img_store.storage_type = self._ds.get_ion_img_storage_type(
                self._db)

            logger.info('Dataset config:\n%s', pformat(self._ds.config))

            completed_moldb_ids, new_moldb_ids = self._moldb_ids()
            for moldb_id in completed_moldb_ids.symmetric_difference(
                    new_moldb_ids):  # ignore ids present in both sets
                mol_db = MolecularDB(
                    id=moldb_id,
                    db=self._db,
                    iso_gen_config=self._ds.config['isotope_generation'])
                if moldb_id not in new_moldb_ids:
                    self._remove_annotation_job(mol_db)
                elif moldb_id not in completed_moldb_ids:
                    self._run_annotation_job(mol_db)

            logger.info("All done!")
            time_spent = time.time() - start
            logger.info('Time spent: %d mins %d secs',
                        *divmod(int(round(time_spent)), 60))
        finally:
            if self._sc:
                self._sc.stop()
            if self._db:
                self._db.close()
            if self._wd_manager and not self.no_clean:
                self._wd_manager.clean()
            logger.info('*' * 150)
Example #23
class SearchJob(object):
    """ Main class responsible for molecule search. Uses other modules of the engine.

    Args
    ----------
    ds_name : string
        A dataset short name
    """
    def __init__(self, client_email, ds_name):
        self.sm_config = SMConfig.get_conf()
        self.client_email = client_email
        self.ds_name = ds_name
        self.ds_id = None
        self.job_id = None
        self.sc = None
        self.db = None
        self.ds = None
        self.fdr = None
        self.formulas = None
        self.ds_config = None
        self.wd_manager = None

    def _read_ds_config(self):
        with open(self.wd_manager.ds_config_path) as f:
            self.ds_config = json.load(f)

    def _configure_spark(self):
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self.sm_config['spark'].items():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        if 'aws' in self.sm_config:
            sconf.set("spark.hadoop.fs.s3a.access.key",
                      self.sm_config['aws']['aws_access_key_id'])
            sconf.set("spark.hadoop.fs.s3a.secret.key",
                      self.sm_config['aws']['aws_secret_access_key'])
            sconf.set("spark.hadoop.fs.s3a.impl",
                      "org.apache.hadoop.fs.s3a.S3AFileSystem")

        # sconf.set("spark.python.profile", "true")
        self.sc = SparkContext(master=self.sm_config['spark']['master'],
                               conf=sconf,
                               appName='SM engine')
        if not self.sm_config['spark']['master'].startswith('local'):
            self.sc.addPyFile(join(local_path(proj_root()), 'sm.zip'))

    def _init_db(self):
        logger.info('Connecting to the DB')
        self.db = DB(self.sm_config['db'])
        self.sf_db_id = self.db.select_one(
            DB_ID_SEL, self.ds_config['database']['name'])[0]

    def store_job_meta(self):
        """ Store search job metadata in the database """
        logger.info('Storing job metadata')
        self.ds_id = int(self.db.select_one(DS_ID_SEL, self.ds_name)[0])
        self.job_id = self.ds_id
        self.db.alter(DEL_JOB_SQL, self.job_id)
        rows = [(self.job_id, self.sf_db_id, self.ds_id,
                 datetime.now().strftime('%Y-%m-%d %H:%M:%S'))]
        self.db.insert(JOB_INS, rows)

        rows = [(self.job_id, adduct)
                for adduct in self.ds_config['isotope_generation']['adducts']]
        self.db.insert(ADDUCT_INS, rows)

    def run(self, input_path, ds_config_path, clean=False):
        """ Entry point of the engine. Molecule search is completed in several steps:
         * Copying input data to the engine work dir
         * Conversion input data (imzML+ibd) to plain text format. One line - one spectrum data
         * Generation and saving to the database theoretical peaks for all formulas from the molecule database
         * Molecules search. The most compute intensive part. Spark is used to run it in distributed manner.
         * Saving results (isotope images and their metrics of quality for each putative molecule) to the database

        Args
        -------
        input_path : string
            Path to the dataset folder with .imzML and .ibd files
        ds_config_path: string
            Path to the dataset config file
        clean : bool
            Clean all interim data files before starting molecule search
        """
        try:
            self.wd_manager = WorkDirManager(self.ds_name)
            if clean:
                self.wd_manager.clean()

            self.wd_manager.copy_input_data(input_path, ds_config_path)

            self._read_ds_config()
            logger.info('Dataset config:\n%s', pformat(self.ds_config))

            self._configure_spark()
            self._init_db()

            if not self.wd_manager.exists(self.wd_manager.txt_path):
                imzml_converter = ImzmlTxtConverter(
                    self.ds_name, self.wd_manager.local_dir.imzml_path,
                    self.wd_manager.local_dir.txt_path,
                    self.wd_manager.local_dir.coord_path)
                imzml_converter.convert()

                if not self.wd_manager.local_fs_only:
                    self.wd_manager.upload_to_remote()

            self.ds = Dataset(self.sc, self.ds_name, self.client_email,
                              input_path, self.ds_config, self.wd_manager,
                              self.db)
            self.ds.save_ds_meta()

            self.store_job_meta()

            theor_peaks_gen = TheorPeaksGenerator(self.sc, self.sm_config,
                                                  self.ds_config)
            theor_peaks_gen.run()

            target_adducts = self.ds_config['isotope_generation']['adducts']
            self.fdr = FDR(self.job_id,
                           self.sf_db_id,
                           decoy_sample_size=20,
                           target_adducts=target_adducts,
                           db=self.db)
            self.fdr.decoy_adduct_selection()
            self.formulas = FormulasSegm(self.job_id, self.sf_db_id,
                                         self.ds_config, self.db)

            # search_alg = MSMBasicSearch(self.sc, self.ds, self.formulas, self.fdr, self.ds_config)
            search_alg = MSMExtraFeats(self.sc, self.ds, self.formulas,
                                       self.fdr, self.ds_config)
            sf_metrics_df, sf_iso_images = search_alg.search()

            search_results = SearchResults(
                self.sf_db_id, self.ds_id, self.job_id, self.ds_name,
                self.formulas.get_sf_adduct_peaksn(), self.db, self.sm_config,
                self.ds_config)
            search_results.sf_metrics_df = sf_metrics_df
            search_results.sf_iso_images = sf_iso_images
            search_results.metrics = search_alg.metrics
            search_results.nrows, search_results.ncols = self.ds.get_dims()
            search_results.store()

            es = ESExporter(self.sm_config)
            es.index_ds(self.db, self.ds_name,
                        self.ds_config['database']['name'])

        except Exception:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            logger.error('\n'.join(
                traceback.format_exception(exc_type, exc_value,
                                           exc_traceback)))
        finally:
            if self.sc:
                # self.sc.show_profiles()
                self.sc.stop()
            if self.db:
                self.db.close()
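A rough invocation of this older SearchJob variant; the e-mail address and paths are placeholders, and SMConfig is assumed to have been initialised elsewhere.

job = SearchJob('user@example.com', 'sci_test_spheroid')
job.run(input_path='/data/sci_test_spheroid',
        ds_config_path='/data/sci_test_spheroid/config.json',
        clean=True)  # wipe interim work-dir files before the search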