def test_search_job_imzml_example_es_export_fails(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                                                  post_images_to_annot_service_mock,
                                                  MolDBServiceWrapperMock, MolDBServiceWrapperMock2,
                                                  sm_config, create_fill_sm_database, es_dsl_search,
                                                  clean_isotope_storage):
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock2)

    get_compute_img_metrics_mock.return_value = lambda *args: (0.9, 0.9, 0.9, [100.], [0], [10.])
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {
        'iso_image_ids': ['iso_image_1', None, None, None]
    }
    post_images_to_annot_service_mock.return_value = {
        35: url_dict,
        44: url_dict
    }

    db = DB(sm_config['db'])

    def throw_exception_function(*args):
        raise Exception('Test')

    try:
        ds_id = '2000-01-01_00h00m'
        upload_dt = datetime.now()
        ds_config_str = open(ds_config_path).read()
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])

        with patch('sm.engine.search_job.ESExporter.index_ds') as index_ds_mock:
            index_ds_mock.side_effect = throw_exception_function

            img_store = ImageStoreServiceWrapper(sm_config['services']['img_service_url'])
            job = SearchJob(img_store=img_store)
            ds = Dataset.load(db, ds_id)
            job.run(ds)
    except ESExportFailedError as e:
        assert e
        # dataset table asserts
        row = db.select_one('SELECT status from dataset')
        assert row[0] == 'FAILED'
    else:
        raise AssertionError('ESExportFailedError should be raised')
    finally:
        db.close()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
def init_moldb():
    db = DB()
    moldb = create_test_molecular_db()
    db.insert(
        "INSERT INTO molecule (mol_id, mol_name, formula, moldb_id) VALUES (%s, %s, %s, %s)",
        rows=[('HMDB0001', 'molecule name', 'C12H24O', moldb.id)],
    )
    return moldb
def store_iso_img_rows(row_it):
    db = DB(db_config)
    try:
        rows = list(row_it)
        if rows:
            db.insert(SF_ISO_IMGS_INS, rows)
    finally:
        db.close()
def test_sm_daemons_annot_fails(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                                post_images_to_annot_service_mock,
                                MolDBServiceWrapperMock,
                                sm_config, test_db, es_dsl_search,
                                clean_isotope_storage):
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)

    def throw_exception_function(*args):
        raise Exception('Test')
    get_compute_img_metrics_mock.return_value = throw_exception_function
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {
        'iso_image_ids': ['iso_image_1', None, None, None]
    }
    post_images_to_annot_service_mock.return_value = {
        35: url_dict,
        44: url_dict
    }

    db = DB(sm_config['db'])
    es = ESExporter(db)
    annotate_daemon = None

    try:
        ds_id = '2000-01-01_00h00m'
        upload_dt = datetime.now()
        ds_config_str = open(ds_config_path).read()
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])

        ds = Dataset.load(db, ds_id)
        queue_pub.publish({'ds_id': ds.id, 'ds_name': ds.name, 'action': 'annotate'})

        run_daemons(db, es)

        # dataset and job tables asserts
        row = db.select_one('SELECT status from dataset')
        assert row[0] == 'FAILED'
        row = db.select_one('SELECT status from job')
        assert row[0] == 'FAILED'
    finally:
        db.close()
        if annotate_daemon:
            annotate_daemon.stop()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
def fill_test_db(create_test_db, drop_test_db):
    db_config = dict(database='sm_test', user='******', host='localhost', password='******')
    db = DB(db_config)
    try:
        db.alter('TRUNCATE dataset CASCADE')
        db.insert("INSERT INTO dataset VALUES (%s, %s, %s, %s, %s)",
                  [(1, 'ds_name', '/ds_path', json.dumps({}), json.dumps({}))])
        db.alter('TRUNCATE coordinates CASCADE')
    finally:
        db.close()
def fill_db(test_db, sm_config, ds_config):
    upload_dt = '2000-01-01 00:00:00'
    ds_id = '2000-01-01'
    metadata = {"meta": "data"}
    db = DB(sm_config['db'])
    db.insert(
        'INSERT INTO dataset (id, name, input_path, upload_dt, metadata, config, status, '
        'is_public, mol_dbs, adducts) '
        'VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)',
        rows=[(ds_id, 'ds_name', 'input_path', upload_dt,
               json.dumps(metadata), json.dumps(ds_config),
               DatasetStatus.FINISHED, True, ['HMDB-v4'], ['+H', '+Na', '+K'])])
def create_fill_sm_database(create_test_db, drop_test_db, sm_config):
    proj_dir_path = dirname(dirname(__file__))
    local('psql -h localhost -U sm sm_test < {}'.format(
        join(proj_dir_path, 'scripts/create_schema.sql')))

    db = DB(sm_config['db'])
    try:
        db.insert('INSERT INTO dataset VALUES (%s, %s, %s, %s, %s, %s)',
                  [(0, 'name', 0, 'fpath', json.dumps({}), json.dumps({}))])
        db.insert('INSERT INTO job VALUES (%s, %s, %s, %s, %s, %s, %s, %s)',
                  [(0, 0, 0, '', 0, 0, None, None)])
    finally:
        db.close()
def test_new_ds_saves_to_db(test_db, metadata, ds_config):
    db = DB()
    moldb = create_test_molecular_db()
    ds = create_test_ds(config={**ds_config, 'database_ids': [moldb.id]})

    ion_metrics_df = pd.DataFrame({
        'formula': ['H2O', 'H2O', 'CO2', 'CO2', 'H2SO4', 'H2SO4'],
        'adduct': ['+H', '[M]+', '+H', '[M]+', '+H', '[M]+'],
        'fdr': [0.05, 0.1, 0.05, 0.1, 0.05, 0.1],
        'image_id': list(map(str, range(6))),
    })
    (job_id, ) = db.insert_return(
        "INSERT INTO job (moldb_id, ds_id, status) VALUES (%s, %s, 'FINISHED') RETURNING id",
        rows=[(moldb.id, ds.id)],
    )
    db.insert(
        'INSERT INTO annotation('
        '   job_id, formula, chem_mod, neutral_loss, adduct, msm, fdr, stats, iso_image_ids'
        ') '
        "VALUES (%s, %s, '', '', %s, 1, %s, '{}', %s)",
        [(job_id, r.formula, r.adduct, r.fdr, [r.image_id])
         for _, r in ion_metrics_df.iterrows()],
    )

    with patch(
            'sm.engine.postprocessing.colocalization.ImageStorage.get_ion_images_for_analysis'
    ) as get_ion_images_for_analysis_mock:
        get_ion_images_for_analysis_mock.side_effect = mock_get_ion_images_for_analysis

        Colocalization(db).run_coloc_job(ds)

    jobs = db.select('SELECT id, error, sample_ion_ids FROM graphql.coloc_job')
    annotations = db.select(
        'SELECT coloc_ion_ids, coloc_coeffs FROM graphql.coloc_annotation')
    ions = db.select('SELECT id FROM graphql.ion')

    assert len(jobs) > 0
    assert not any(job[1] for job in jobs)
    assert jobs[0][2]
    assert len(annotations) > 10
    assert all(len(ann[0]) == len(ann[1]) for ann in annotations)
    assert len(ions) == len(ion_metrics_df)
def fill_db(test_db, metadata, ds_config):
    upload_dt = '2000-01-01 00:00:00'
    ds_id = '2000-01-01'
    db = DB()
    db.insert(
        ('INSERT INTO dataset (id, name, input_path, upload_dt, metadata, config, status, '
         'status_update_dt, is_public) '
         'VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)'),
        rows=[(
            ds_id,
            'ds_name',
            'input_path',
            upload_dt,
            json.dumps(metadata),
            json.dumps(ds_config),
            DatasetStatus.FINISHED,
            upload_dt,
            True,
        )],
    )
    create_test_molecular_db()
def create_fill_test_db(create_test_db, drop_test_db):
    db_config = dict(database='sm_test', user='******', host='localhost', password='******')
    db = DB(db_config)
    try:
        db.alter('TRUNCATE formula_db CASCADE')
        db.insert('INSERT INTO formula_db VALUES (%s, %s, %s)', [(0, '2016-01-01', 'HMDB')])
        db.insert('INSERT INTO formula VALUES (%s, %s, %s, %s, %s)', [(9, 0, '04138', 'Au', 'Gold')])
        db.insert('INSERT INTO agg_formula VALUES (%s, %s, %s, %s, %s)', [(9, 0, 'Au', ['04138'], ['Gold'])])
        db.alter('TRUNCATE theor_peaks CASCADE')
        db.insert('INSERT INTO theor_peaks VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)',
                  [(0, 9, '+H', 0.01, 1, 10000, [100, 200], [10, 1], [], [])])
    finally:
        db.close()
def fill_db(test_db, sm_config, ds_config):
    upload_dt = '2000-01-01 00:00:00'
    ds_id = '2000-01-01'
    meta = {"meta": "data"}
    db = DB(sm_config['db'])
    db.insert(
        'INSERT INTO dataset (id, name, input_path, upload_dt, metadata, config, '
        'status, is_public, mol_dbs, adducts) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)',
        rows=[(ds_id, 'ds_name', 'input_path', upload_dt, json.dumps(meta),
               json.dumps(ds_config), 'FINISHED', True, ['HMDB-v4'], ['+H'])])
    db.insert("INSERT INTO job (id, db_id, ds_id) VALUES (%s, %s, %s)",
              rows=[(0, 0, ds_id)])
    db.insert("INSERT INTO sum_formula (id, db_id, sf) VALUES (%s, %s, %s)",
              rows=[(1, 0, 'H2O')])
    db.insert((
        "INSERT INTO iso_image_metrics (job_id, db_id, sf, adduct, iso_image_ids) "
        "VALUES (%s, %s, %s, %s, %s)"),
              rows=[(0, 0, 'H2O', '+H', ['iso_image_1_id', 'iso_image_2_id'])])
    db.close()
def create_fill_sm_database(create_test_db, drop_test_db, sm_config):
    local('psql -h localhost -U sm sm_test < {}'.format(join(proj_dir_path, 'scripts/create_schema.sql')))

    db = DB(sm_config['db'])
    try:
        db.insert('INSERT INTO formula_db VALUES (%s, %s, %s)',
                  [(0, '2016-01-01', 'HMDB')])
        db.insert('INSERT INTO formula VALUES (%s, %s, %s, %s, %s)',
                  [(100, 0, '00001', 'compound_name', 'C12H24O')])
        db.insert('INSERT INTO agg_formula VALUES (%s, %s, %s, %s, %s)',
                  [(10007, 0, 'C12H24O', ['00001'], ['compound_name'])])
    finally:
        db.close()
def fill_db(test_db):
    db = DB()
    db.insert(
        'INSERT INTO graphql.user (id, name, email) VALUES (%s, %s, %s)',
        [(USER_ID, 'name', '*****@*****.**')],
    )
    db.insert(
        'INSERT INTO graphql.group (id, name, short_name) VALUES (%s, %s, %s)',
        [(GROUP_ID, 'test-group', 'test-group')],
    )
    db.insert(
        'INSERT INTO graphql.dataset (id, user_id, group_id) VALUES (%s, %s, %s)',
        [('dataset id', USER_ID, GROUP_ID)],
    )

    yield
class SearchJob(object):
    """ Main class responsible for molecule search. Uses other modules of the engine.

    Args
    ----------
    ds_name : string
        A dataset short name
    """
    def __init__(self, client_email, ds_name):
        self.sm_config = SMConfig.get_conf()
        self.client_email = client_email
        self.ds_name = ds_name
        self.ds_id = None
        self.sf_db_id = None
        self.job_id = None
        self.sc = None
        self.db = None
        self.ds = None
        self.fdr = None
        self.formulas = None
        self.ds_config = None
        self.wd_manager = None

    def _read_ds_config(self):
        with open(self.wd_manager.ds_config_path) as f:
            self.ds_config = json.load(f)

    def _configure_spark(self):
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self.sm_config['spark'].items():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        if 'aws' in self.sm_config:
            sconf.set("spark.hadoop.fs.s3a.access.key",
                      self.sm_config['aws']['aws_access_key_id'])
            sconf.set("spark.hadoop.fs.s3a.secret.key",
                      self.sm_config['aws']['aws_secret_access_key'])
            sconf.set("spark.hadoop.fs.s3a.impl",
                      "org.apache.hadoop.fs.s3a.S3AFileSystem")

        # sconf.set("spark.python.profile", "true")
        self.sc = SparkContext(master=self.sm_config['spark']['master'],
                               conf=sconf,
                               appName='SM engine')
        if not self.sm_config['spark']['master'].startswith('local'):
            self.sc.addPyFile(join(local_path(proj_root()), 'sm.zip'))

    def _init_db(self):
        logger.info('Connecting to the DB')
        self.db = DB(self.sm_config['db'])
        self.sf_db_id = self.db.select_one(
            DB_ID_SEL, self.ds_config['database']['name'])[0]

    def store_job_meta(self):
        """ Store search job metadata in the database """
        logger.info('Storing job metadata')
        self.ds_id = int(self.db.select_one(DS_ID_SEL, self.ds_name)[0])
        self.job_id = self.ds_id
        self.db.alter(DEL_JOB_SQL, self.job_id)
        rows = [(self.job_id, self.sf_db_id, self.ds_id,
                 datetime.now().strftime('%Y-%m-%d %H:%M:%S'))]
        self.db.insert(JOB_INS, rows)

        rows = [(self.job_id, adduct)
                for adduct in self.ds_config['isotope_generation']['adducts']]
        self.db.insert(ADDUCT_INS, rows)

    def run(self, input_path, ds_config_path, clean=False):
        """ Entry point of the engine. Molecule search is completed in several steps:
         * Copying input data to the engine work dir
         * Conversion input data (imzML+ibd) to plain text format. One line - one spectrum data
         * Generation and saving to the database theoretical peaks for all formulas from the molecule database
         * Molecules search. The most compute intensive part. Spark is used to run it in distributed manner.
         * Saving results (isotope images and their metrics of quality for each putative molecule) to the database

        Args
        -------
        input_path : string
            Path to the dataset folder with .imzML and .ibd files
        ds_config_path: string
            Path to the dataset config file
        clean : bool
            Clean all interim data files before starting molecule search
        """
        try:
            self.wd_manager = WorkDirManager(self.ds_name)
            if clean:
                self.wd_manager.clean()

            self.wd_manager.copy_input_data(input_path, ds_config_path)

            self._read_ds_config()
            logger.info('Dataset config:\n%s', pformat(self.ds_config))

            self._configure_spark()
            self._init_db()

            if not self.wd_manager.exists(self.wd_manager.txt_path):
                imzml_converter = ImzmlTxtConverter(
                    self.ds_name, self.wd_manager.local_dir.imzml_path,
                    self.wd_manager.local_dir.txt_path,
                    self.wd_manager.local_dir.coord_path)
                imzml_converter.convert()

                if not self.wd_manager.local_fs_only:
                    self.wd_manager.upload_to_remote()

            self.ds = Dataset(self.sc, self.ds_name, self.client_email,
                              input_path, self.ds_config, self.wd_manager,
                              self.db)
            self.ds.save_ds_meta()

            self.store_job_meta()

            theor_peaks_gen = TheorPeaksGenerator(self.sc, self.sm_config,
                                                  self.ds_config)
            theor_peaks_gen.run()

            target_adducts = self.ds_config['isotope_generation']['adducts']
            self.fdr = FDR(self.job_id,
                           self.sf_db_id,
                           decoy_sample_size=20,
                           target_adducts=target_adducts,
                           db=self.db)
            self.fdr.decoy_adduct_selection()
            self.formulas = FormulasSegm(self.job_id, self.sf_db_id,
                                         self.ds_config, self.db)

            # search_alg = MSMBasicSearch(self.sc, self.ds, self.formulas, self.fdr, self.ds_config)
            search_alg = MSMExtraFeats(self.sc, self.ds, self.formulas,
                                       self.fdr, self.ds_config)
            sf_metrics_df, sf_iso_images = search_alg.search()

            search_results = SearchResults(
                self.sf_db_id, self.ds_id, self.job_id, self.ds_name,
                self.formulas.get_sf_adduct_peaksn(), self.db, self.sm_config,
                self.ds_config)
            search_results.sf_metrics_df = sf_metrics_df
            search_results.sf_iso_images = sf_iso_images
            search_results.metrics = search_alg.metrics
            search_results.nrows, search_results.ncols = self.ds.get_dims()
            search_results.store()

            es = ESExporter(self.sm_config)
            es.index_ds(self.db, self.ds_name,
                        self.ds_config['database']['name'])

        except Exception:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            logger.error('\n'.join(
                traceback.format_exception(exc_type, exc_value,
                                           exc_traceback)))
        finally:
            if self.sc:
                # self.sc.show_profiles()
                self.sc.stop()
            if self.db:
                self.db.close()
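A minimal sketch of driving this legacy SearchJob end to end; the config path, client email, and dataset locations below are hypothetical, and the SMConfig import path and set_path call are assumptions based on the get_conf() usage inside the class:

from sm.engine.util import SMConfig  # assumed import path

SMConfig.set_path('conf/config.json')  # hypothetical config file location
job = SearchJob(client_email='user@example.com', ds_name='sci_test_ds')
job.run(
    input_path='/data/sci_test_ds',                 # folder containing the .imzML and .ibd files
    ds_config_path='/data/sci_test_ds/config.json',
    clean=True,                                     # remove interim work-dir files before the search
)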
def add_diagnostics(diagnostics: List[DatasetDiagnostic]):
    """Upserts dataset diagnostics, overwriting existing values with the same ds_id, job_id, type"""
    # Validate input, as postgres can't enforce the JSON columns have the correct schema,
    # and many places (graphql, python client, etc.) rely on these structures.

    if not diagnostics:
        return

    for diagnostic in diagnostics:
        assert 'ds_id' in diagnostic
        assert 'type' in diagnostic
        images = diagnostic.get('images', [])
        assert all(image['key'] in DiagnosticImageKey for image in images)
        assert all(image['format'] in DiagnosticImageFormat for image in images)
        assert all(image['image_id'] in image['url'] for image in images)
        image_keys = {(image.get('key'), image.get('index')) for image in images}
        assert len(image_keys) == len(images), 'diagnostic image keys should be unique'

    db = DB()
    # Find all diagnostics that should be replaced by the new diagnostics
    existing = db.select_with_fields(
        """
        WITH new_diagnostic AS (
            SELECT UNNEST(%s::text[]) as ds_id, UNNEST(%s::int[]) as job_id,
            UNNEST(%s::text[]) as type
        )
        SELECT dd.ds_id, dd.id, dd.images
        FROM new_diagnostic nd
        JOIN dataset_diagnostic dd ON nd.ds_id = dd.ds_id
            AND (nd.job_id = dd.job_id OR (nd.job_id IS NULL AND dd.job_id is NULL))
            AND nd.type = dd.type
        """,
        list(map(list, zip(*((d['ds_id'], d.get('job_id'), d['type']) for d in diagnostics)))),
    )

    if existing:
        logger.debug(
            f'Deleting {len(existing)} existing diagnostics for dataset {existing[0]["ds_id"]}'
        )
        # Delete existing images
        image_ids_by_ds = defaultdict(list)
        for row in existing:
            for img in row['images'] or []:
                image_ids_by_ds[row['ds_id']].append(img['image_id'])
        for ds_id, image_ids in image_ids_by_ds.items():
            image_storage.delete_images(image_storage.DIAG, ds_id, image_ids)

        # Delete existing DB rows
        db.alter(
            'DELETE FROM dataset_diagnostic WHERE id = ANY(%s::uuid[])',
            ([row['id'] for row in existing], ),
        )

    logger.debug(
        f'Inserting {len(diagnostics)} diagnostics for dataset {diagnostics[0]["ds_id"]}'
    )
    db.insert(
        'INSERT INTO dataset_diagnostic (ds_id, job_id, type, updated_dt, data, error, images) '
        'VALUES (%s, %s, %s, %s, %s, %s, %s)',
        [(
            d['ds_id'],
            d.get('job_id'),
            d['type'],
            datetime.now(),
            numpy_json_dumps(d['data']) if d.get('data') is not None else None,
            d.get('error'),
            numpy_json_dumps(d.get('images', [])),
        ) for d in diagnostics],
    )
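For reference, a hedged sketch of the diagnostic shape this function validates; the type, key, and format strings below are illustrative placeholders that would need to be real DiagnosticImageKey / DiagnosticImageFormat members in the actual codebase:

add_diagnostics([{
    'ds_id': '2000-01-01_00h00m',
    'job_id': 42,                                  # optional; may be None (hypothetical value)
    'type': 'TIC',                                 # hypothetical diagnostic type
    'data': {'min_tic': 0.0, 'max_tic': 123.4},    # illustrative JSON-serializable payload
    'error': None,
    'images': [{
        'key': 'TIC',                              # must be a DiagnosticImageKey member
        'index': 0,                                # (key, index) pairs must be unique per diagnostic
        'format': 'NPY',                           # must be a DiagnosticImageFormat member
        'image_id': 'img-1',
        'url': 'http://storage/diag/img-1',        # must contain image_id (asserted above)
    }],
}])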
def test_index_ds_works(sm_config, test_db, es, es_dsl_search, sm_index,
                        ds_config, metadata, annotation_stats):
    ds_id = '2000-01-01_00h00m'
    upload_dt = datetime.now().isoformat()
    last_finished = '2017-01-01 00:00:00'
    iso_image_ids = ['iso_img_id_1', 'iso_img_id_2']
    stats = json.dumps(annotation_stats)

    db = DB()
    db.insert(
        "INSERT INTO dataset(id, name, input_path, config, metadata, upload_dt, status, "
        "status_update_dt, is_public, acq_geometry, ion_thumbnail) "
        "VALUES (%s, 'ds_name', 'ds_input_path', %s, %s, %s, 'ds_status', %s, true, '{}', %s)",
        [[ds_id, json.dumps(ds_config), json.dumps(metadata), upload_dt, upload_dt, 'thumb-id']],
    )
    moldb = create_test_molecular_db()
    (job_id, ) = db.insert_return(
        "INSERT INTO job(ds_id, moldb_id, status, start, finish) "
        "VALUES (%s, %s, 'job_status', %s, %s) RETURNING id",
        rows=[(ds_id, moldb.id, last_finished, last_finished)],
    )
    (user_id, ) = db.insert_return(
        "INSERT INTO graphql.user (email, name, role) "
        "VALUES ('email', 'user_name', 'user') RETURNING id",
        [[]],
    )
    (group_id, ) = db.insert_return(
        "INSERT INTO graphql.group (name, short_name) VALUES ('group name', 'grp') RETURNING id",
        [[]],
    )
    db.insert(
        "INSERT INTO graphql.dataset(id, user_id, group_id) VALUES (%s, %s, %s)",
        [[ds_id, user_id, group_id]],
    )
    ion_id1, ion_id2 = db.insert_return(
        "INSERT INTO graphql.ion(ion, formula, chem_mod, neutral_loss, adduct, charge, ion_formula) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s) RETURNING id",
        [
            ['H2O-H+O-H+H', 'H2O', '-H+O', '-H', '+H', 1, 'HO2'],
            ['Au+H', 'Au', '', '', '+H', 1, 'HAu'],
        ],
    )
    db.insert(
        "INSERT INTO annotation(job_id, formula, chem_mod, neutral_loss, adduct, "
        "msm, fdr, stats, iso_image_ids, ion_id) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
        [
            [
                job_id, 'H2O', '-H+O', '-H', '+H', 1, 0.1, stats,
                iso_image_ids, ion_id1
            ],
            [
                job_id, 'Au', '', '', '+H', 1, 0.05, stats, iso_image_ids,
                ion_id2
            ],
        ],
    )

    isocalc_mock = MagicMock(IsocalcWrapper)
    isocalc_mock.centroids = lambda formula: {
        'H2O+H': ([100.0, 200.0], None),
        'H2O-H+O-H+H': ([100.0, 200.0, 300.0], None),
        'Au+H': ([10.0, 20.0], None),
    }[formula]
    isocalc_mock.mass_accuracy_bounds = lambda mzs: (mzs, mzs)

    with patch(
            'sm.engine.es_export.molecular_db.fetch_molecules',
            return_value=pd.DataFrame(
                [('H2O', 'mol_id', 'mol_name'), ('Au', 'mol_id', 'mol_name')],
                columns=['formula', 'mol_id', 'mol_name'],
            ),
    ):
        es_exp = ESExporter(db, sm_config)
        es_exp.delete_ds(ds_id)
        es_exp.index_ds(
            ds_id=ds_id,
            moldb=moldb,
            isocalc=isocalc_mock,
        )

    wait_for_es(es, sm_config['elasticsearch']['index'])

    ds_d = (
        es_dsl_search.filter('term', _type='dataset')
        .execute().to_dict()['hits']['hits'][0]['_source']
    )
    expected_ds_fields = {
        'ds_last_finished': last_finished,
        'ds_config': ds_config,
        'ds_adducts': ds_config['isotope_generation']['adducts'],
        'ds_moldb_ids': ds_config['database_ids'],
        'ds_chem_mods': [],
        'ds_neutral_losses': [],
        'ds_project_ids': [],
        'ds_project_names': [],
        'ds_meta': metadata,
        'ds_status': 'ds_status',
        'ds_status_update_dt': upload_dt,
        'ds_name': 'ds_name',
        'ds_input_path': 'ds_input_path',
        'ds_id': ds_id,
        'ds_upload_dt': upload_dt,
        'ds_is_public': True,
        'ds_submitter_email': 'email',
        'ds_submitter_id': user_id,
        'ds_submitter_name': 'user_name',
        'ds_group_approved': False,
        'ds_group_id': group_id,
        'ds_group_name': 'group name',
        'ds_group_short_name': 'grp',
    }
    assert ds_d == {
        **expected_ds_fields,
        'ds_acq_geometry': {},
        'annotation_counts': [{
            'db': {'id': moldb.id, 'name': moldb.name},
            'counts': [
                {'level': 5, 'n': 1},
                {'level': 10, 'n': 2},
                {'level': 20, 'n': 2},
                {'level': 50, 'n': 2},
            ],
        }],
    }
    ann_1_d = (
        es_dsl_search.filter('term', formula='H2O')
        .execute().to_dict()['hits']['hits'][0]['_source']
    )
    top_level_stats = {
        'pattern_match': annotation_stats['spectral'],
        'image_corr': annotation_stats['spatial'],
        'chaos': annotation_stats['chaos'],
        **{
            key: value
            for key, value in annotation_stats.items() if key in NON_METRIC_STATS
        },
    }
    metrics = {
        key: value
        for key, value in annotation_stats.items()
        if key not in NON_METRIC_STATS
    }
    assert ann_1_d == {
        **expected_ds_fields,
        **top_level_stats,
        'metrics': metrics,
        'fdr': 0.1,
        'formula': 'H2O',
        'msm': 1.0,
        'ion': 'H2O-H+O-H+H+',
        'ion_formula': 'HO2',
        'centroid_mzs': [100.0, 200.0, 300.0],
        'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'],
        'iso_image_urls': [
            f'http://localhost:9000/{sm_config["image_storage"]["bucket"]}/iso/{ds_id}/iso_img_id_1',
            f'http://localhost:9000/{sm_config["image_storage"]["bucket"]}/iso/{ds_id}/iso_img_id_2',
        ],
        'isobars': [],
        'isomer_ions': [],
        'polarity': '+',
        'job_id': 1,
        'adduct': '+H',
        'neutral_loss': '-H',
        'chem_mod': '-H+O',
        'annotation_counts': [],
        'comp_names': ['mol_name'],
        'comps_count_with_isomers': 1,
        'db_id': moldb.id,
        'db_name': moldb.name,
        'db_version': moldb.version,
        'mz': 100.0,
        'comp_ids': ['mol_id'],
        'annotation_id': 1,
        'off_sample_label': None,
        'off_sample_prob': None,
    }
    ann_2_d = (
        es_dsl_search.filter('term', formula='Au')
        .execute().to_dict()['hits']['hits'][0]['_source']
    )
    assert ann_2_d == {
        **expected_ds_fields,
        **top_level_stats,
        'metrics': metrics,
        'fdr': 0.05,
        'formula': 'Au',
        'msm': 1.0,
        'ion': 'Au+H+',
        'ion_formula': 'HAu',
        'centroid_mzs': [10.0, 20.0],
        'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'],
        'iso_image_urls': [
            f'http://localhost:9000/{sm_config["image_storage"]["bucket"]}/iso/{ds_id}/iso_img_id_1',
            f'http://localhost:9000/{sm_config["image_storage"]["bucket"]}/iso/{ds_id}/iso_img_id_2',
        ],
        'isobars': [],
        'isomer_ions': [],
        'polarity': '+',
        'job_id': 1,
        'adduct': '+H',
        'neutral_loss': '',
        'chem_mod': '',
        'annotation_counts': [],
        'comp_names': ['mol_name'],
        'comps_count_with_isomers': 1,
        'db_id': moldb.id,
        'db_name': moldb.name,
        'db_version': moldb.version,
        'mz': 10.0,
        'comp_ids': ['mol_id'],
        'annotation_id': 2,
        'off_sample_label': None,
        'off_sample_prob': None,
    }
def test_sm_daemons(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                    post_images_to_annot_service_mock,
                    MolDBServiceWrapperMock,
                    sm_config, test_db, es_dsl_search, clean_isotope_storage):
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)

    get_compute_img_metrics_mock.return_value = lambda *args: (0.9, 0.9, 0.9, [100.], [0], [10.])
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {
        'iso_image_ids': ['iso_image_1', None, None, None]
    }
    post_images_to_annot_service_mock.return_value = {
        35: url_dict,
        44: url_dict
    }

    db = DB(sm_config['db'])
    es = ESExporter(db)
    annotate_daemon = None
    update_daemon = None

    try:
        ds_config_str = open(ds_config_path).read()
        upload_dt = datetime.now()
        ds_id = '2000-01-01_00h00m'
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{"Data_Type": "Imaging MS"}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])

        ds = Dataset.load(db, ds_id)
        queue_pub.publish({'ds_id': ds.id, 'ds_name': ds.name, 'action': 'annotate'})

        run_daemons(db, es)

        # dataset table asserts
        rows = db.select('SELECT id, name, input_path, upload_dt, status from dataset')
        input_path = join(dirname(__file__), 'data', test_ds_name)
        assert len(rows) == 1
        assert rows[0] == (ds_id, test_ds_name, input_path, upload_dt, DatasetStatus.FINISHED)

        # ms acquisition geometry asserts
        rows = db.select('SELECT acq_geometry from dataset')
        assert len(rows) == 1
        assert rows[0][0] == ds.get_acq_geometry(db)
        assert rows[0][0] == {
            ACQ_GEOMETRY_KEYS.LENGTH_UNIT: 'nm',
            ACQ_GEOMETRY_KEYS.AcqGridSection.section_name: {
                ACQ_GEOMETRY_KEYS.AcqGridSection.REGULAR_GRID: True,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_X: 3,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_Y: 3,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_X: 100,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_Y: 100
            },
            ACQ_GEOMETRY_KEYS.PixelSizeSection.section_name: {
                ACQ_GEOMETRY_KEYS.PixelSizeSection.REGULAR_SIZE: True,
                ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_X: 100,
                ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_Y: 100
            }
        }

        # job table asserts
        rows = db.select('SELECT db_id, ds_id, status, start, finish from job')
        assert len(rows) == 1
        db_id, ds_id, status, start, finish = rows[0]
        assert (db_id, ds_id, status) == (0, '2000-01-01_00h00m', JobStatus.FINISHED)
        assert start < finish

        # image metrics asserts
        rows = db.select(('SELECT db_id, sf, adduct, stats, iso_image_ids '
                          'FROM iso_image_metrics '
                          'ORDER BY sf, adduct'))

        assert rows[0] == (0, 'C12H24O', '+K', {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9,
                                                'total_iso_ints': [100.], 'min_iso_ints': [0], 'max_iso_ints': [10.]},
                           ['iso_image_1', None, None, None])
        assert rows[1] == (0, 'C12H24O', '+Na', {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9,
                                                 'total_iso_ints': [100.], 'min_iso_ints': [0], 'max_iso_ints': [10.]},
                           ['iso_image_1', None, None, None])

        time.sleep(1)  # Waiting for ES
        # ES asserts
        ds_docs = es_dsl_search.query('term', _type='dataset').execute().to_dict()['hits']['hits']
        assert 1 == len(ds_docs)
        ann_docs = es_dsl_search.query('term', _type='annotation').execute().to_dict()['hits']['hits']
        assert len(ann_docs) == len(rows)
        for doc in ann_docs:
            assert doc['_id'].startswith(ds_id)

    finally:
        db.close()
        if annotate_daemon:
            annotate_daemon.stop()
        if update_daemon:
            update_daemon.stop()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
def test_search_job_imzml_example(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                                  post_images_to_annot_service_mock, MolDBServiceWrapperMock, MolDBServiceWrapperMock2,
                                  sm_config, create_fill_sm_database, es_dsl_search, clean_isotope_storage):
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock2)

    get_compute_img_metrics_mock.return_value = lambda *args: (0.9, 0.9, 0.9, [100.], [0], [10.])
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {
        'iso_image_ids': ['iso_image_1', None, None, None]
    }
    post_images_to_annot_service_mock.return_value = {
        35: url_dict,
        44: url_dict
    }

    db = DB(sm_config['db'])

    try:
        ds_config_str = open(ds_config_path).read()
        upload_dt = datetime.now()
        ds_id = '2000-01-01_00h00m'
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])

        img_store = ImageStoreServiceWrapper(sm_config['services']['img_service_url'])
        job = SearchJob(img_store=img_store)
        job._sm_config['rabbitmq'] = {}  # avoid talking to RabbitMQ during the test
        ds = Dataset.load(db, ds_id)
        job.run(ds)

        # dataset table asserts
        rows = db.select('SELECT id, name, input_path, upload_dt, status from dataset')
        input_path = join(dirname(__file__), 'data', test_ds_name)
        assert len(rows) == 1
        assert rows[0] == (ds_id, test_ds_name, input_path, upload_dt, DatasetStatus.FINISHED)

        # ms acquisition geometry asserts
        rows = db.select('SELECT acq_geometry from dataset')
        assert len(rows) == 1
        assert rows[0][0] == ds.get_acq_geometry(db)
        assert rows[0][0] == {
            ACQ_GEOMETRY_KEYS.LENGTH_UNIT: 'nm',
            ACQ_GEOMETRY_KEYS.AcqGridSection.section_name: {
                ACQ_GEOMETRY_KEYS.AcqGridSection.REGULAR_GRID: True,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_X: 3,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_Y: 3,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_X: 100,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_Y: 100
            },
            ACQ_GEOMETRY_KEYS.PixelSizeSection.section_name: {
                ACQ_GEOMETRY_KEYS.PixelSizeSection.REGULAR_SIZE: True,
                ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_X: 100,
                ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_Y: 100
            }
        }

        # job table asserts
        rows = db.select('SELECT db_id, ds_id, status, start, finish from job')
        assert len(rows) == 1
        db_id, ds_id, status, start, finish = rows[0]
        assert (db_id, ds_id, status) == (0, '2000-01-01_00h00m', 'FINISHED')
        assert start < finish

        # image metrics asserts
        rows = db.select(('SELECT db_id, sf, adduct, stats, iso_image_ids '
                          'FROM iso_image_metrics '
                          'ORDER BY sf, adduct'))

        assert rows[0] == (0, 'C12H24O', '+K', {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9,
                                                'total_iso_ints': [100.], 'min_iso_ints': [0], 'max_iso_ints': [10.]},
                           ['iso_image_1', None, None, None])
        assert rows[1] == (0, 'C12H24O', '+Na', {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9,
                                                 'total_iso_ints': [100.], 'min_iso_ints': [0], 'max_iso_ints': [10.]},
                           ['iso_image_1', None, None, None])

        time.sleep(1)  # Waiting for ES
        # ES asserts
        ds_docs = es_dsl_search.query('term', _type='dataset').execute().to_dict()['hits']['hits']
        assert 1 == len(ds_docs)
        ann_docs = es_dsl_search.query('term', _type='annotation').execute().to_dict()['hits']['hits']
        assert len(ann_docs) == len(rows)
        for doc in ann_docs:
            assert doc['_id'].startswith(ds_id)

    finally:
        db.close()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
def fill_db(test_db, metadata, ds_config):
    upload_dt = '2000-01-01 00:00:00'
    ds_id = '2000-01-01'
    db = DB()
    db.insert(
        'INSERT INTO dataset ('
        '   id, name, input_path, upload_dt, metadata, config, status, status_update_dt, is_public'
        ') values (%s, %s, %s, %s, %s, %s, %s, %s, %s)',
        rows=[(
            ds_id,
            'ds_name',
            'input_path',
            upload_dt,
            json.dumps(metadata),
            json.dumps(ds_config),
            'FINISHED',
            upload_dt,
            True,
        )],
    )
    moldb = create_test_molecular_db()
    db.insert("INSERT INTO job (id, moldb_id, ds_id) VALUES (%s, %s, %s)",
              rows=[(0, moldb.id, ds_id)])
    db.insert(
        "INSERT INTO annotation (job_id, formula, chem_mod, neutral_loss, adduct, "
        "msm, fdr, stats, iso_image_ids) VALUES (%s, %s, '', '', %s, 0.5, 0.2, '{}', %s)",
        rows=[
            (0, 'H2O', '+H', ['iso_image_11', 'iso_image_12']),
            (0, 'CH4', '+H', ['iso_image_21', 'iso_image_22']),
        ],
    )
    user_id = str(uuid.uuid4())
    db.insert(
        "INSERT INTO graphql.user (id, name, email) VALUES (%s, %s, %s)",
        rows=[(user_id, 'name', '*****@*****.**')],
    )
    group_id = str(uuid.uuid4())
    db.insert(
        "INSERT INTO graphql.group (id, name, short_name) VALUES (%s, %s, %s)",
        rows=[(group_id, 'group name', 'short name')],
    )
    db.insert(
        "INSERT INTO graphql.dataset (id, user_id, group_id) VALUES (%s, %s, %s)",
        rows=[('dataset id', user_id, group_id)],
    )

    return {"moldb": moldb}