コード例 #1
0
def run_off_sample(sm_config, ds_ids_str, sql_where, fix_missing, overwrite_existing):
    """Run off-sample classification for a selection of datasets and reindex them.

    The datasets are picked by the first non-empty selector, checked in this
    order: a comma-separated id list (``ds_ids_str``), a SQL ``WHERE``
    condition over the ``dataset`` table (``sql_where``), or the
    ``MISSING_OFF_SAMPLE_SEL`` query (``fix_missing``).  When nothing is
    selected a warning is logged and the function returns.  A failure on one
    dataset is logged and does not stop the remaining ones.

    NOTE(review): ``sql_where`` is interpolated into the query verbatim, so it
    must only ever come from a trusted operator.
    """
    db = DB()

    if ds_ids_str:
        ds_ids = ds_ids_str.split(',')
    elif sql_where:
        rows = db.select(f'SELECT DISTINCT dataset.id FROM dataset WHERE {sql_where}')
        ds_ids = [found_id for (found_id,) in rows]
    elif fix_missing:
        logger.info('Checking for missing off-sample jobs...')
        ds_ids = [missing_id for (missing_id,) in db.select(MISSING_OFF_SAMPLE_SEL)]
        logger.info(f'Found {len(ds_ids)} missing off-sample sets')
    else:
        ds_ids = None

    if not ds_ids:
        logger.warning('No datasets match filter')
        return

    es_exp = ESExporter(db, sm_config)
    services_config_key = 'services'
    for i, ds_id in enumerate(ds_ids):
        try:
            logger.info(f'Running off-sample on {i+1} out of {len(ds_ids)}')
            dataset = Dataset.load(db, ds_id)
            classify_dataset_ion_images(
                db, dataset, sm_config[services_config_key], overwrite_existing
            )
            es_exp.reindex_ds(ds_id)
        except Exception:
            # Keep going: one broken dataset must not abort the whole batch.
            logger.error(f'Failed to run off-sample on {ds_id}', exc_info=True)
コード例 #2
0
def test_delete_ds__completely(es, sm_index, sm_config):
    """Deleting a dataset removes its dataset document and all of its annotations.

    Seeds the index with annotations for two datasets plus a dataset document
    for ``dataset1``, deletes ``dataset1`` through ``ESExporter.delete_ds`` and
    checks that only the ``dataset2`` annotation is still counted.
    """
    index = sm_config['elasticsearch']['index']
    seed_docs = [
        ('annotation', 'id1', {'ds_id': 'dataset1', 'db_name': 'HMDB', 'db_version': '2016'}),
        ('annotation', 'id2', {'ds_id': 'dataset1', 'db_name': 'ChEBI', 'db_version': '2016'}),
        ('annotation', 'id3', {'ds_id': 'dataset2', 'db_name': 'HMDB', 'db_version': '2016'}),
        ('dataset', 'dataset1', {'ds_id': 'dataset1', 'db_name': 'HMDB', 'db_version': '2016'}),
    ]
    for doc_type, doc_id, doc_body in seed_docs:
        es.create(index=index, doc_type=doc_type, id=doc_id, body=doc_body)

    wait_for_es(sec=1)

    exporter = ESExporter(MagicMock(spec=DB))
    exporter.delete_ds(ds_id='dataset1')

    wait_for_es(sec=1)

    def count_docs(doc_type, *terms):
        """Count documents of ``doc_type`` matching every given term filter."""
        query = {'query': {'bool': {'filter': [{'term': term} for term in terms]}}}
        return es.count(index=index, doc_type=doc_type, body=query)['count']

    assert count_docs('annotation', {'ds_id': 'dataset1'}, {'db_name': 'HMDB'}) == 0
    assert count_docs('annotation', {'ds_id': 'dataset1'}, {'db_name': 'ChEBI'}) == 0
    assert count_docs('annotation', {'ds_id': 'dataset2'}, {'db_name': 'HMDB'}) == 1
    assert count_docs('dataset', {'ds_id': 'dataset1'}, {'_type': 'dataset'}) == 0
コード例 #3
0
def test_delete_ds__completely(es, sm_index, sm_config):
    """Check that ``ESExporter.delete_ds`` wipes every document of one dataset.

    Three annotations (two for ``dataset1``, one for ``dataset2``) and a
    dataset document for ``dataset1`` are created; after the delete only the
    ``dataset2`` annotation must remain.
    """
    index = sm_config['elasticsearch']['index']

    def seed(doc_type, doc_id, ds_id, db_name):
        es.create(index=index, doc_type=doc_type, id=doc_id,
                  body={'ds_id': ds_id, 'db_name': db_name, 'db_version': '2016'})

    seed('annotation', 'id1', 'dataset1', 'HMDB')
    seed('annotation', 'id2', 'dataset1', 'ChEBI')
    seed('annotation', 'id3', 'dataset2', 'HMDB')
    seed('dataset', 'dataset1', 'dataset1', 'HMDB')

    wait_for_es(sec=1)

    db_mock = MagicMock(spec=DB)
    ESExporter(db_mock).delete_ds(ds_id='dataset1')

    wait_for_es(sec=1)

    # (doc type, term filters, expected number of matching documents)
    expectations = [
        ('annotation', [{'term': {'ds_id': 'dataset1'}}, {'term': {'db_name': 'HMDB'}}], 0),
        ('annotation', [{'term': {'ds_id': 'dataset1'}}, {'term': {'db_name': 'ChEBI'}}], 0),
        ('annotation', [{'term': {'ds_id': 'dataset2'}}, {'term': {'db_name': 'HMDB'}}], 1),
        ('dataset', [{'term': {'ds_id': 'dataset1'}}, {'term': {'_type': 'dataset'}}], 0),
    ]
    for doc_type, filters, expected_count in expectations:
        body = {'query': {'bool': {'filter': filters}}}
        assert es.count(index=index, doc_type=doc_type, body=body)['count'] == expected_count
コード例 #4
0
def test_foo(sm_config):
    """Index two mocked annotation rows and check the documents fetched back from ES."""
    db_mock = MagicMock(DB)
    db_mock.select.return_value = [
        ('test_ds', 'test_db', 'H20', '+H', [], []),
        ('test_ds', 'test_db', 'Au', '+H', [], []),
    ]

    exporter = ESExporter(sm_config)
    exporter.index_ds(db_mock, 'test_ds', 'test_db')

    es = Elasticsearch()

    # (document id, sum formula expected in that document)
    expected = [
        ('test_ds_test_db_H20_+H', 'H20'),
        ('test_ds_test_db_Au_+H', 'Au'),
    ]
    for doc_id, sf in expected:
        doc = es.get(index='sm', id=doc_id, doc_type='annotation', _source=True)
        assert doc['_source'] == {
            'ds_name': 'test_ds',
            'db_name': 'test_db',
            'sf': sf,
            'adduct': '+H',
            'comp_names': '',
            'comp_ids': '',
        }
コード例 #5
0
def reindex_results(sm_config, ds_id, ds_mask, use_inactive_index,
                    offline_reindex, update_fields):
    """Reindex (or partially update) dataset documents in Elasticsearch.

    Args:
        sm_config: Engine configuration dict; its ``'elasticsearch'`` section
            is read here.
        ds_id: Comma-separated dataset ids, or a falsy value.
        ds_mask: Dataset name prefix; used only when ``ds_id`` is falsy.
        use_inactive_index: When truthy, the ES config is replaced by the
            result of ``get_inactive_index_es_config``.
        offline_reindex: When truthy, everything is delegated to
            ``_reindex_all`` and the other selectors are ignored.
        update_fields: Comma-separated field names; when truthy the datasets
            are partially updated instead of fully reindexed.

    Raises:
        AssertionError: if none of ``ds_id``, ``ds_mask`` or
            ``offline_reindex`` is given.
    """
    assert ds_id or ds_mask or offline_reindex

    IsocalcWrapper.set_centroids_cache_enabled(True)

    if offline_reindex:
        _reindex_all(sm_config)
        return

    es_config = sm_config['elasticsearch']
    if use_inactive_index:
        es_config = get_inactive_index_es_config(es_config)

    db = DB()
    es_exp = ESExporter(db, sm_config={**sm_config, 'elasticsearch': es_config})

    if ds_id:
        ds_ids = ds_id.split(',')
    elif ds_mask:
        # Bind the prefix as a query parameter instead of formatting it into
        # the SQL text, so quotes in the mask cannot break or alter the query.
        rows = db.select(
            'select id from dataset where name like %s', params=(f'{ds_mask}%',)
        )
        ds_ids = [found_id for (found_id,) in rows]
    else:
        ds_ids = []

    if update_fields:
        _partial_update_datasets(ds_ids, es_exp, update_fields.split(','))
    else:
        _reindex_datasets(ds_ids, es_exp)
コード例 #6
0
def migrate_isotopic_images(ds_id):
    """Transfer a dataset's isotopic images and reindex its ES documents.

    Nothing is transferred when the dataset has no image ids or when
    ``_es_docs_migrated`` reports that its ES documents are already migrated.
    """
    output.print('Migrating isotopic images')

    db = DB()
    image_ids = db.select_onecol(SEL_DS_IMG_IDS, params=(ds_id,))
    es_exporter = ESExporter(db, sm_config)

    # The migration check is only consulted when there are images at all.
    if not image_ids or _es_docs_migrated(es_exporter._es, ds_id):
        return

    with timeit():
        output.print('Transferring images...')
        output.print(len(image_ids))
        transfer_images(ds_id, 'iso_images', image_storage.ISO, image_ids)

    with timeit():
        output.print('Reindexing ES documents...')
        es_exporter.reindex_ds(ds_id)
コード例 #7
0
def test_foo(sm_config):
    """Index two mocked annotation rows (with m/z) and check the stored ES documents."""
    db_mock = MagicMock(DB)
    db_mock.select.return_value = [
        ('test_ds', 'test_db', 'H20', '+H', [], [], 100),
        ('test_ds', 'test_db', 'Au', '+H', [], [], 200),
    ]

    exporter = ESExporter(sm_config)
    exporter.index_ds(db_mock, 'test_ds', 'test_db')

    es = Elasticsearch()

    # (document id, expected ``_source`` of that document)
    expected_docs = [
        ('test_ds_test_db_H20_+H',
         {'ds_name': 'test_ds', 'db_name': 'test_db', 'sf': 'H20', 'adduct': '+H',
          'comp_names': '', 'comp_ids': '', 'mz': '00100.0000'}),
        ('test_ds_test_db_Au_+H',
         {'ds_name': 'test_ds', 'db_name': 'test_db', 'sf': 'Au', 'adduct': '+H',
          'comp_names': '', 'comp_ids': '', 'mz': '00200.0000'}),
    ]
    for doc_id, expected_source in expected_docs:
        doc = es.get(index='sm', id=doc_id, doc_type='annotation', _source=True)
        assert doc['_source'] == expected_source
コード例 #8
0
def test_sm_daemons_annot_fails(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                                post_images_to_annot_service_mock,
                                MolDBServiceWrapperMock,
                                sm_config, test_db, es_dsl_search,
                                clean_isotope_storage):
    """A failure during annotation must mark both the dataset and the job as FAILED.

    The image-metrics computation is replaced by a function that always
    raises; after the daemons have run, the ``dataset`` and ``job`` rows are
    expected to carry the ``FAILED`` status.
    """
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)

    def throw_exception_function(*args):
        raise Exception('Test')
    get_compute_img_metrics_mock.return_value = throw_exception_function
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {
        'iso_image_ids': ['iso_image_1', None, None, None]
    }
    post_images_to_annot_service_mock.return_value = {
        35: url_dict,
        44: url_dict
    }

    db = DB(sm_config['db'])
    es = ESExporter(db)

    try:
        ds_id = '2000-01-01_00h00m'
        upload_dt = datetime.now()
        # Read through a context manager so the file handle is closed right away.
        with open(ds_config_path) as ds_config_file:
            ds_config_str = ds_config_file.read()
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])

        ds = Dataset.load(db, ds_id)
        queue_pub.publish({'ds_id': ds.id, 'ds_name': ds.name, 'action': 'annotate'})

        run_daemons(db, es)

        # dataset and job tables asserts
        row = db.select_one('SELECT status from dataset')
        assert row[0] == 'FAILED'
        row = db.select_one('SELECT status from job')
        assert row[0] == 'FAILED'
    finally:
        db.close()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
コード例 #9
0
def reindex_all_results(conf):
    """Drop and recreate the ``sm`` index, then index every dataset into it.

    Each dataset is indexed with the molecular database name read from its
    stored config.
    """
    db = DB(conf['db'])
    es_exp = ESExporter(conf)

    index_name = 'sm'
    es_exp.delete_index(name=index_name)
    es_exp.create_index(name=index_name)

    query = "select name, config -> 'database'::text -> 'name'::text from dataset"
    for row in db.select(query):
        ds_name, db_name = row
        es_exp.index_ds(db, ds_name, db_name)
コード例 #10
0
ファイル: datasets.py プロジェクト: metaspace2020/metaspace
def _create_dataset_manager(db):
    """Build an ``SMapiDatasetManager`` on top of ``db``.

    The manager gets an ``ESExporter`` created from ``db`` and the module
    config, one queue publisher per queue descriptor (annotate, update,
    lithops, dataset status) and the module logger.
    """
    # Built in the same order the keyword arguments are listed.
    manager_kwargs = dict(
        db=db,
        es=ESExporter(db, sm_config),
        annot_queue=_create_queue_publisher(SM_ANNOTATE),
        update_queue=_create_queue_publisher(SM_UPDATE),
        lit_queue=_create_queue_publisher(SM_LITHOPS),
        status_queue=_create_queue_publisher(SM_DS_STATUS),
        logger=logger,
    )
    return SMapiDatasetManager(**manager_kwargs)
コード例 #11
0
ファイル: job.py プロジェクト: metaspace2020/metaspace
def del_jobs(ds: Dataset, moldb_ids: Optional[Iterable[int]] = None):
    """
    Delete a dataset's jobs for the specified moldbs, or all jobs if moldb_ids is None.
    Also cleans up the annotations from ElasticSearch and deletes their ion images.

    Args:
        ds: Dataset whose jobs are deleted.
        moldb_ids: Ids of the molecular databases to delete jobs for.  Any
            iterable is accepted, including one-shot iterators.
    """
    db = DB()
    es = ESExporter(db)

    if moldb_ids is None:
        moldb_ids = get_ds_moldb_ids(ds.id)
    # The ids are needed twice (moldb lookup and job query).  Materialise them
    # once so that a generator argument is not exhausted by the first use.
    moldb_ids = list(moldb_ids)
    moldbs = molecular_db.find_by_ids(moldb_ids)

    job_ids = db.select_onecol(
        'SELECT j.id FROM job j WHERE ds_id = %s AND moldb_id = ANY(%s)',
        (ds.id, moldb_ids))
    del_diagnostics(ds.id, job_ids)

    for moldb in moldbs:
        logger.info(
            f'Deleting isotopic images: ds_id={ds.id} ds_name={ds.name} moldb={moldb}'
        )
        img_id_rows = db.select_onecol(
            'SELECT iso_image_ids '
            'FROM annotation m '
            'JOIN job j ON j.id = m.job_id '
            'JOIN dataset d ON d.id = j.ds_id '
            'WHERE ds_id = %s AND j.moldb_id = %s',
            (ds.id, moldb.id),
        )

        # Flatten the per-annotation id lists, dropping the empty (None) slots.
        image_ids = [
            img_id for img_ids in img_id_rows for img_id in img_ids
            if img_id is not None
        ]
        image_storage.delete_images(image_storage.ISO, ds.id, image_ids)

        logger.info(
            f"Deleting job results: ds_id={ds.id} ds_name={ds.name} moldb={moldb}"
        )
        db.alter('DELETE FROM job WHERE ds_id = %s and moldb_id = %s',
                 (ds.id, moldb.id))
        es.delete_ds(ds.id, moldb)
コード例 #12
0
def get_manager():
    """Create a ``DatasetManager`` with a DB handle, an ES exporter and a status queue publisher."""
    db = DB()
    status_queue_pub = QueuePublisher(
        config=sm_config['rabbitmq'], qdesc=SM_DS_STATUS, logger=logger
    )
    es_exporter = ESExporter(db, sm_config)
    return DatasetManager(
        db=db, es=es_exporter, status_queue=status_queue_pub, logger=logger
    )
コード例 #13
0
ファイル: api.py プロジェクト: metaspace2020/offsample
def _create_dataset_manager(db):
    """Build an ``SMapiDatasetManager`` that stores images through the ``'fs'`` storage type.

    The image store URL is read from the ``services.img_service_url`` entry
    of the engine configuration.
    """
    img_service_url = SMConfig.get_conf()['services']['img_service_url']
    img_store = ImageStoreServiceWrapper(img_service_url)
    img_store.storage_type = 'fs'

    # Built in the same order the keyword arguments are listed.
    manager_kwargs = dict(
        db=db,
        es=ESExporter(db),
        image_store=img_store,
        annot_queue=_create_queue_publisher(SM_ANNOTATE),
        update_queue=_create_queue_publisher(SM_UPDATE),
        status_queue=_create_queue_publisher(SM_DS_STATUS),
        logger=logger,
    )
    return SMapiDatasetManager(**manager_kwargs)
コード例 #14
0
 def __init__(
     self,
     ds: Dataset,
     perf: Profiler,
     sm_config: Optional[Dict] = None,
 ):
     """Store the dataset and profiler and create the DB and ES exporter handles.

     Args:
         ds: Dataset this object works on.
         perf: Profiler instance kept for later use.
         sm_config: Engine configuration; when falsy, ``SMConfig.get_conf()``
             is used instead.
     """
     if not sm_config:
         sm_config = SMConfig.get_conf()
     self._sm_config = sm_config

     self._ds = ds
     self._perf = perf

     self._db = DB()
     self._es = ESExporter(self._db, self._sm_config)

     # Not available at construction time; start out empty.
     self._sc = None
     self._ds_data_path = None
コード例 #15
0
    def run_search(self, mock_img_store=False):
        """Annotate the dataset built from this object's input files.

        Args:
            mock_img_store: When truthy, the image store comes from
                ``_create_img_store_mock``; otherwise a real
                ``ImageStoreServiceWrapper`` is created from the configured
                image service URL.
        """
        img_store = (
            self._create_img_store_mock()
            if mock_img_store
            else ImageStoreServiceWrapper(self.sm_config['services']['img_service_url'])
        )
        es_exporter = ESExporter(self.db)
        manager = SMDaemonManager(db=self.db, es=es_exporter, img_store=img_store)

        dataset = create_ds_from_files(self.ds_id, self.ds_name, self.input_path)
        # Kept as a local import, exactly where the original performs it.
        from sm.engine.search_job import SearchJob
        manager.annotate(dataset, search_job_factory=SearchJob, del_first=True)
コード例 #16
0
def run_search(sm_config):
    """Create a dataset from the command-line arguments and annotate it.

    The config and metadata paths default to ``config.json`` and
    ``meta.json`` inside ``args.input_path``.  The Lithops annotation path is
    taken when ``args.use_lithops`` is truthy.
    """
    db = DB()
    es_exporter = ESExporter(db, sm_config)
    manager = DatasetManager(db, es_exporter)

    config_path = args.config_path or Path(args.input_path) / 'config.json'
    meta_path = args.meta_path or Path(args.input_path) / 'meta.json'

    dataset = create_ds_from_files(
        args.ds_id, args.ds_name, args.input_path, config_path, meta_path
    )
    annotate = manager.annotate_lithops if args.use_lithops else manager.annotate
    annotate(dataset, del_first=True)
コード例 #17
0
def reindex_all_results(conf):
    """Recreate the ``sm`` index and index all datasets found in the database.

    For every dataset row the dataset name and the molecular database name
    taken from its config are passed to ``ESExporter.index_ds``.
    """
    db = DB(conf['db'])
    es_exp = ESExporter(conf)

    # Start from an empty index.
    es_exp.delete_index(name='sm')
    es_exp.create_index(name='sm')

    pairs_query = "select name, config -> 'database'::text -> 'name'::text from dataset"
    ds_db_pairs = db.select(pairs_query)
    for pair in ds_db_pairs:
        ds_name, db_name = pair
        es_exp.index_ds(db, ds_name, db_name)
コード例 #18
0
def test_sm_daemons_annot_fails(
    MSMSearchMock,
    post_images_to_image_store_mock,
    test_db,
    es_dsl_search,
    clean_isotope_storage,
    reset_queues,
    metadata,
    ds_config,
    queue_pub,
    local_sm_config,
):
    """An exception raised by the MSM search must leave the dataset in the FAILED status."""
    moldb = init_moldb()

    def throw_exception_function(*args, **kwargs):
        raise Exception('Test exception')

    # Every search performed through the mocked algorithm raises.
    MSMSearchMock().search.side_effect = throw_exception_function

    image_ids = ['iso_image_1', None, None, None]
    post_images_to_image_store_mock.return_value = {
        formula_i: image_ids for formula_i in range(3)
    }

    db = DB()
    es = ESExporter(db, local_sm_config)
    test_ds_config = {**ds_config, 'database_ids': [moldb.id]}
    ds = create_test_ds(
        name=test_ds_name,
        input_path=input_dir_path,
        config=test_ds_config,
        status=DatasetStatus.QUEUED,
        es=es,
    )

    annotate_msg = {
        'ds_id': ds.id,
        'ds_name': test_ds_name,
        'action': DaemonAction.ANNOTATE,
    }
    queue_pub.publish(annotate_msg)

    run_daemons(db, es, local_sm_config)

    # dataset and job tables asserts
    status_row = db.select_one('SELECT status from dataset')
    assert len(status_row) == 1
    assert status_row[0] == 'FAILED'
コード例 #19
0
    def __init__(
        self,
        executor: Executor,
        ds: Dataset,
        perf: Profiler,
        sm_config: Optional[Dict] = None,
        use_cache=False,
        store_images=True,
    ):
        """Set up storage, DB/ES handles, uploaded inputs and the processing pipeline.

        Args
        ========

        executor: Executor handed to the pipeline.
        ds: Dataset to process; its input path and config are read here.
        perf: Profiler instance kept for later use.
        sm_config: Engine configuration; ``SMConfig.get_conf()`` is used when falsy.
        use_cache: For development - cache the results after each pipeline step so that
                   it's easier to quickly re-run specific steps.
        store_images: Flag stored on the instance as ``store_images``.
        """
        self.ds = ds
        self.perf = perf
        self.store_images = store_images

        sm_config = sm_config or SMConfig.get_conf()
        lithops_config = sm_config['lithops']
        self.sm_storage = lithops_config['sm_storage']
        self.storage = Storage(lithops_config)
        self.s3_client = get_s3_client()
        self.db = DB()
        self.es = ESExporter(self.db, sm_config)

        imzml_cobj, ibd_cobj = _upload_imzmls_from_prefix_if_needed(
            self.ds.input_path, self.storage, self.sm_storage, self.s3_client
        )
        self.imzml_cobj, self.ibd_cobj = imzml_cobj, ibd_cobj
        self.moldb_defs = _upload_moldbs_from_db(
            self.ds.config['database_ids'], self.storage, self.sm_storage
        )

        # The cache key is derived from the dataset input path and config.
        cache_key: Optional[str] = (
            jsonhash({'input_path': ds.input_path, 'ds': ds.config}) if use_cache else None
        )

        self.pipe = Pipeline(
            self.imzml_cobj,
            self.ibd_cobj,
            self.moldb_defs,
            self.ds.config,
            cache_key=cache_key,
            executor=executor,
        )

        # No results yet at construction time.
        self.results_dfs = None
        self.png_cobjs = None
        self.db_formula_image_ids = None
コード例 #20
0
def test_add_isomer_fields_to_anns():
    """Annotations sharing an ``ion_formula`` list each other as isomers.

    Three documents share the formula ``H2O`` and one has ``H1O``; the
    exporter is expected to fill ``isomer_ions`` and
    ``comps_count_with_isomers`` on every document in place.
    """
    # (ion, ion formula, compound ids)
    ion_specs = [
        ('H2O+H-H-', 'H2O', ['1']),
        ('H3O-H-', 'H2O', ['2', '3']),
        ('H3O+CO2-CO2-H-', 'H2O', ['2', '3', '4']),
        ('H2O-H-', 'H1O', ['4']),
    ]
    ann_docs = [
        {'ion': ion, 'ion_formula': ion_formula, 'comp_ids': comp_ids}
        for ion, ion_formula, comp_ids in ion_specs
    ]

    ESExporter._add_isomer_fields_to_anns(ann_docs)

    expected_isomer_ions = [
        ['H3O-H-', 'H3O+CO2-CO2-H-'],
        ['H2O+H-H-', 'H3O+CO2-CO2-H-'],
        ['H2O+H-H-', 'H3O-H-'],
        [],
    ]
    expected_comps_counts = [4, 4, 4, 1]

    actual_isomer_ions = [doc['isomer_ions'] for doc in ann_docs]
    actual_comps_counts = [doc['comps_count_with_isomers'] for doc in ann_docs]
    assert actual_isomer_ions == expected_isomer_ions

    assert actual_comps_counts == expected_comps_counts
コード例 #21
0
def set_metadata_thumbnail(db, config, ds_name):
    """Add an optical-image thumbnail to datasets that have a transform stored.

    Args:
        db: Database handle used for all queries.
        config: Engine configuration; ``services.img_service_url`` is read.
        ds_name: Name of the dataset to process, or ``ALL_DS_MASK`` to process
            every dataset.
    """
    # The WHERE clause and the bound parameters are derived from the same
    # condition, so the query never receives a parameter it has no
    # placeholder for (or a placeholder without a parameter).
    if ds_name != ALL_DS_MASK:
        ds_thumb_query = 'SELECT id, transform, thumbnail from dataset WHERE name = %s'
        params = (ds_name,)
    else:
        ds_thumb_query = 'SELECT id, transform, thumbnail from dataset '
        params = None

    for ds_id, transform, _thumbnail in db.select(ds_thumb_query, params=params):
        if transform is None:
            # Datasets without a transform are left untouched.
            continue

        ds = api.Dataset.load(db=db, ds_id=ds_id)
        img_store = ImageStoreServiceWrapper(
            config['services']['img_service_url'])
        img_store.storage_type = 'fs'
        sm = SMapiDatasetManager(db=db,
                                 es=ESExporter(db),
                                 image_store=img_store,
                                 mode='queue')
        ds_opt_img_query = 'SELECT optical_image from dataset WHERE id = %s'
        img_id = db.select(ds_opt_img_query, params=(ds.id, ))
        sm._add_thumbnail_optical_image(ds, f"{img_id[0][0]}", transform)
コード例 #22
0
def _reindex_all(conf):
    """Reindex every dataset into a newly created index, then remap the alias to it.

    The new index name is obtained from ``ESIndexManager`` based on the
    configured alias.  If reindexing or remapping fails, the new index is
    deleted and the exception is re-raised.
    """
    es_config = conf['elasticsearch']
    alias = es_config['index']
    es_man = ESIndexManager(es_config)
    current_index = es_man.internal_index_name(alias)
    new_index = es_man.another_index_name(current_index)
    es_man.create_index(new_index)

    try:
        # Work on a copy so the caller's ES config keeps pointing at the alias.
        tmp_es_config = deepcopy(es_config)
        tmp_es_config['index'] = new_index

        db = DB(conf['db'])
        es_exp = ESExporter(db, tmp_es_config)
        dataset_rows = db.select('select id, name, config from dataset')
        _reindex_datasets(dataset_rows, es_exp)

        es_man.remap_alias(tmp_es_config['index'], alias=alias)
    except Exception as exc:
        es_man.delete_index(new_index)
        raise exc
コード例 #23
0
def reindex_results(ds_id, ds_mask):
    """Reindex the datasets selected by id or by name prefix.

    Args:
        ds_id: Id of a single dataset to reindex, or a falsy value.
        ds_mask: Dataset name prefix, used when ``ds_id`` is falsy.  The
            special value ``'_all_'`` reindexes everything via
            ``_reindex_all``.

    Raises:
        AssertionError: if neither ``ds_id`` nor ``ds_mask`` is given.
    """
    assert ds_id or ds_mask

    conf = SMConfig.get_conf()
    if ds_mask == '_all_':
        _reindex_all(conf)
        return

    db = DB(conf['db'])
    es_exp = ESExporter(db)

    # Values are bound as query parameters rather than formatted into the SQL
    # text, so quotes in the id or mask cannot break or alter the query.
    if ds_id:
        rows = db.select(
            'select id, name, config from dataset where id = %s',
            params=(str(ds_id),))
    elif ds_mask:
        rows = db.select(
            'select id, name, config from dataset where name like %s',
            params=(f'{ds_mask}%',))
    else:
        rows = []

    _reindex_datasets(rows, es_exp)
コード例 #24
0
def _reindex_all(sm_config):
    """Reindex all datasets into a new index and switch the alias over to it.

    On success the alias is remapped and the old index is deleted.  On any
    failure the new index is deleted and the exception is re-raised, leaving
    the old index in place.
    """
    es_config = sm_config['elasticsearch']
    alias = es_config['index']
    es_man = ESIndexManager(es_config)
    old_index = es_man.internal_index_name(alias)
    new_index = es_man.another_index_name(old_index)
    es_man.create_index(new_index)

    try:
        inactive_es_config = get_inactive_index_es_config(es_config)
        db = DB()
        exporter_config = {**sm_config, 'elasticsearch': inactive_es_config}
        es_exp = ESExporter(db, exporter_config)
        all_ds_ids = [row[0] for row in db.select('select id from dataset')]
        _reindex_datasets(all_ds_ids, es_exp)

        es_man.remap_alias(inactive_es_config['index'], alias=alias)
    except Exception as exc:
        es_man.delete_index(new_index)
        raise exc
    else:
        # Only reached when reindexing and remapping both succeeded.
        es_man.delete_index(old_index)
コード例 #25
0
def test_update_ds_works_for_all_fields(sm_config, test_db, es, sm_index,
                                        es_dsl_search):
    """``ESExporter.update_ds`` must refresh the listed fields on dataset and annotation docs.

    An annotation document and a dataset document are seeded with old values,
    the DB mock returns the new values, and after ``update_ds`` every updated
    field is expected under its ``ds_``-prefixed key in both documents.
    """
    update = {
        'name': 'new_ds_name',
        'submitter_id': 'new_ds_submitter_id',
        'group_id': 'new_ds_group_id',
        'projects_ids': ['proj_id1', 'proj_id2'],
        'is_public': True,
    }

    index = sm_config['elasticsearch']['index']
    es.create(
        index=index,
        doc_type='annotation',
        id='id1',
        body={
            'ds_id': 'dataset1',
            'ds_name': 'ds_name',
            'ds_submitter_id': 'ds_submitter',
            'ds_group_id': 'ds_group_id',
            # Same key as in the dataset document and the DB row below, so the
            # final assertions check that the stale value is overwritten.
            'ds_projects_ids': [],
            'ds_is_public': False,
        },
    )
    es.create(
        index=index,
        doc_type='dataset',
        id='dataset1',
        body={
            'ds_id': 'dataset1',
            'ds_name': 'ds_name',
            'ds_submitter_id': 'ds_submitter_id',
            'ds_group_id': 'ds_group_id',
            'ds_projects_ids': [],
            'ds_is_public': False,
        },
    )
    wait_for_es(es, index)

    db_mock = MagicMock(spec=DB)
    db_mock.select_with_fields.return_value = [{
        'ds_name': 'new_ds_name',
        'ds_submitter_id': 'new_ds_submitter_id',
        'ds_submitter_name': 'submitter_name',
        'ds_submitter_email': 'submitter_email',
        'ds_group_id': 'new_ds_group_id',
        'ds_group_name': 'group_name',
        'ds_group_approved': True,
        'ds_group_short_name': 'group_short_name',
        'ds_projects_ids': ['proj_id1', 'proj_id2'],
        'ds_is_public': True,
    }]

    es_exporter = ESExporter(db_mock, sm_config)
    es_exporter.update_ds('dataset1', fields=list(update.keys()))
    wait_for_es(es, index)

    ds_doc = (es_dsl_search.filter(
        'term',
        _type='dataset').execute().to_dict()['hits']['hits'][0]['_source'])
    for k, v in update.items():
        assert v == ds_doc[f'ds_{k}']

    ann_doc = (es_dsl_search.filter(
        'term',
        _type='annotation').execute().to_dict()['hits']['hits'][0]['_source'])
    for k, v in update.items():
        assert v == ann_doc[f'ds_{k}']
コード例 #26
0
def test_index_ds_works(es_dsl_search, sm_index, sm_config):
    """``ESExporter.index_ds`` must write one dataset document and one document per annotation.

    The DB, molecular database and isotope calculator are mocked; the test
    then compares the ``_source`` of the dataset document and of both
    annotation documents with the expected content.
    """
    ds_id = '2000-01-01_00h00m'
    upload_dt = datetime.now().isoformat(' ')
    mol_db_id = 0
    last_finished = '2017-01-01T00:00:00'

    def db_sel_side_effect(sql, params):
        """Return canned rows for the dataset and annotation queries."""
        if sql == DATASET_SEL:
            return [{
                'ds_id': ds_id,
                'ds_name': 'ds_name',
                'ds_input_path': 'ds_input_path',
                'ds_config': 'ds_config',
                'ds_meta': {},
                'ds_upload_dt': upload_dt,
                'ds_status': 'ds_status',
                'ds_last_finished': datetime.strptime(last_finished, '%Y-%m-%dT%H:%M:%S'),
                'ds_is_public': True,
                'ds_ion_img_storage': 'fs',
                'ds_acq_geometry': {}
            }]
        elif sql == ANNOTATIONS_SEL:
            return [{
                'sf': 'H2O',
                'sf_adduct': 'H2O+H',
                'chaos': 1,
                'image_corr': 1,
                'pattern_match': 1,
                'total_iso_ints': 100,
                'min_iso_ints': 0,
                'max_iso_ints': 100,
                'msm': 1,
                'adduct': '+H',
                'job_id': 1,
                'fdr': 0.1,
                'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'],
                'polarity': '+'
            }, {
                'sf': 'Au',
                'sf_adduct': 'Au+H',
                'chaos': 1,
                'image_corr': 1,
                'pattern_match': 1,
                'total_iso_ints': 100,
                'min_iso_ints': 0,
                'max_iso_ints': 100,
                'msm': 1,
                'adduct': '+H',
                'job_id': 1,
                'fdr': 0.05,
                'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'],
                'polarity': '+'
            }]
        else:
            # Log the arguments this function actually received; the message
            # needs placeholders for them to be rendered.
            logging.getLogger('engine').error(
                'Wrong db_sel_side_effect arguments: %s, %s', sql, params)

    db_mock = MagicMock(spec=DB)
    db_mock.select_with_fields.side_effect = db_sel_side_effect

    mol_db_mock = MagicMock(MolecularDB)
    mol_db_mock.id = mol_db_id
    mol_db_mock.name = 'db_name'
    mol_db_mock.version = '2017'
    mol_db_mock.get_molecules.return_value = pd.DataFrame([('H2O', 'mol_id', 'mol_name'), ('Au', 'mol_id', 'mol_name')],
                                                          columns=['sf', 'mol_id', 'mol_name'])

    isocalc_mock = MagicMock(IsocalcWrapper)
    isocalc_mock.ion_centroids = lambda sf, adduct: {
        ('H2O', '+H'): ([100., 200.], None),
        ('Au', '+H'): ([10., 20.], None)
    }[(sf, adduct)]

    es_exp = ESExporter(db_mock)
    es_exp.delete_ds(ds_id)
    es_exp.index_ds(ds_id=ds_id, mol_db=mol_db_mock, isocalc=isocalc_mock)

    wait_for_es(sec=1)

    ds_d = es_dsl_search.filter('term', _type='dataset').execute().to_dict()['hits']['hits'][0]['_source']
    assert ds_d == {
        'ds_last_finished': last_finished, 'ds_config': 'ds_config', 'ds_meta': {},
        'ds_status': 'ds_status', 'ds_name': 'ds_name', 'ds_input_path': 'ds_input_path', 'ds_id': ds_id,
        'ds_upload_dt': upload_dt,
        'annotation_counts': [{'db': {'name': 'db_name', 'version': '2017'},
                               'counts': [{'level': 5, 'n': 1}, {'level': 10, 'n': 2},
                                          {'level': 20, 'n': 2}, {'level': 50, 'n': 2}]}],
        'ds_is_public': True,
        'ds_acq_geometry': {},
        'ds_ion_img_storage': 'fs'
    }
    ann_1_d = es_dsl_search.filter('term', sf='H2O').execute().to_dict()['hits']['hits'][0]['_source']
    assert ann_1_d == {
        'pattern_match': 1, 'image_corr': 1, 'fdr': 0.1, 'chaos': 1, 'sf': 'H2O', 'min_iso_ints': 0,
        'msm': 1, 'sf_adduct': 'H2O+H', 'total_iso_ints': 100, 'centroid_mzs': [100., 200.],
        'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'], 'polarity': '+', 'job_id': 1, 'max_iso_ints': 100,
        'adduct': '+H', 'ds_name': 'ds_name', 'annotation_counts': [], 'db_version': '2017', 'ds_status': 'ds_status',
        'ion_add_pol': '[M+H]+', 'comp_names': ['mol_name'], 'db_name': 'db_name', 'mz': 100., 'ds_meta': {},
        'comp_ids': ['mol_id'], 'ds_config': 'ds_config', 'ds_input_path': 'ds_input_path', 'ds_id': ds_id,
        'ds_upload_dt': upload_dt, 'ds_last_finished': last_finished,
        'ds_ion_img_storage': 'fs', 'ds_is_public': True
    }
    ann_2_d = es_dsl_search.filter('term', sf='Au').execute().to_dict()['hits']['hits'][0]['_source']
    assert ann_2_d == {
        'pattern_match': 1, 'image_corr': 1, 'fdr': 0.05, 'chaos': 1, 'sf': 'Au', 'min_iso_ints': 0,
        'msm': 1, 'sf_adduct': 'Au+H', 'total_iso_ints': 100, 'centroid_mzs': [10., 20.],
        'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'], 'polarity': '+', 'job_id': 1, 'max_iso_ints': 100,
        'adduct': '+H',  'ds_name': 'ds_name', 'annotation_counts': [], 'db_version': '2017', 'ds_status': 'ds_status',
        'ion_add_pol': '[M+H]+', 'comp_names': ['mol_name'], 'db_name': 'db_name', 'mz': 10., 'ds_meta': {},
        'comp_ids': ['mol_id'], 'ds_config': 'ds_config', 'ds_input_path': 'ds_input_path', 'ds_id': ds_id,
        'ds_upload_dt': upload_dt, 'ds_last_finished': last_finished,
        'ds_ion_img_storage': 'fs', 'ds_is_public': True
    }
コード例 #27
0
ファイル: search_job.py プロジェクト: metaspace2020/offsample
    def run(self, ds):
        """ Entry point of the engine. Molecule search is completed in several steps:
            * Copying input data to the engine work dir
            * Conversion of input mass spec files to plain text format. One line - one spectrum data
            * Generation and saving to the database of theoretical peaks for all formulas from the molecule database
            * Molecules search. The most compute intensive part. Spark is used to run it in a distributed manner.
            * Saving results (isotope images and their metrics of quality for each putative molecule) to the database

        Molecular databases are reconciled one by one: for every id that is in exactly
        one of the two sets returned by ``_moldb_ids`` (completed vs. new), the
        annotation job is removed when the id is not among the new ids, and run when it
        is not among the completed ones.

        Whether or not the search succeeds, the Spark context is stopped, the database
        connection is closed and (unless ``no_clean`` is set) the work dir is cleaned.

        Args
        ----
            ds : sm.engine.dataset_manager.Dataset
        """
        try:
            logger.info('*' * 150)
            start = time.time()

            # presumably _init_db sets self._db, which the exporter is built on - confirm
            self._init_db()
            self._es = ESExporter(self._db)
            self._ds = ds

            # A status queue publisher is only created when a rabbitmq config is present
            if self._sm_config['rabbitmq']:
                self._status_queue = QueuePublisher(
                    config=self._sm_config['rabbitmq'],
                    qdesc=SM_DS_STATUS,
                    logger=logger)
            else:
                self._status_queue = None

            self._wd_manager = WorkDirManager(ds.id)
            self._configure_spark()

            # Clean the work dir before starting, unless cleaning is disabled
            if not self.no_clean:
                self._wd_manager.clean()

            self._ds_reader = DatasetReader(self._ds.input_path, self._sc,
                                            self._wd_manager)
            self._ds_reader.copy_convert_input_data()

            self._save_data_from_raw_ms_file()
            self._img_store.storage_type = self._ds.get_ion_img_storage_type(
                self._db)

            logger.info('Dataset config:\n%s', pformat(self._ds.config))

            completed_moldb_ids, new_moldb_ids = self._moldb_ids()
            for moldb_id in completed_moldb_ids.symmetric_difference(
                    new_moldb_ids):  # ignore ids present in both sets
                mol_db = MolecularDB(
                    id=moldb_id,
                    db=self._db,
                    iso_gen_config=self._ds.config['isotope_generation'])
                if moldb_id not in new_moldb_ids:
                    # Completed earlier but no longer requested
                    self._remove_annotation_job(mol_db)
                elif moldb_id not in completed_moldb_ids:
                    # Requested but not completed yet
                    self._run_annotation_job(mol_db)

            logger.info("All done!")
            time_spent = time.time() - start
            logger.info('Time spent: %d mins %d secs',
                        *divmod(int(round(time_spent)), 60))
        finally:
            # Release resources on both the success and the failure path
            if self._sc:
                self._sc.stop()
            if self._db:
                self._db.close()
            if self._wd_manager and not self.no_clean:
                self._wd_manager.clean()
            logger.info('*' * 150)
コード例 #28
0
                        type=str,
                        help='SM config path')
    args = parser.parse_args()

    SMConfig.set_path(args.config_path)
    sm_config = SMConfig.get_conf()
    init_loggers(sm_config['logs'])
    logger = logging.getLogger(f'{args.name}-daemon')
    logger.info(f'Starting {args.name}-daemon')

    db = DB(sm_config['db'])
    status_queue_pub = QueuePublisher(config=sm_config['rabbitmq'],
                                      qdesc=SM_DS_STATUS,
                                      logger=logger)
    manager = SMDaemonManager(db=db,
                              es=ESExporter(db),
                              img_store=ImageStoreServiceWrapper(
                                  sm_config['services']['img_service_url']),
                              status_queue=status_queue_pub,
                              logger=logger)
    if args.name == 'annotate':
        daemon = SMAnnotateDaemon(manager=manager,
                                  annot_qdesc=SM_ANNOTATE,
                                  upd_qdesc=SM_UPDATE)
    elif args.name == 'update':
        daemon = SMUpdateDaemon(manager=manager, update_qdesc=SM_UPDATE)
    else:
        raise Exception(f'Wrong SM daemon name: {args.name}')

    signal.signal(signal.SIGINT, lambda *args: daemon.stop())
    signal.signal(signal.SIGTERM, lambda *args: daemon.stop())
コード例 #29
0
def test_index_ds_works(es_dsl_search, sm_index, sm_config):
    """Check that ``ESExporter.index_ds`` writes the expected ES documents.

    The DB is a ``MagicMock`` whose ``select_with_fields`` returns one dataset
    row for ``DATASET_SEL`` and two annotation rows (H2O and Au, both with the
    ``+H`` adduct) for ``ANNOTATIONS_SEL``. The molecular DB and the isotope
    calculator are mocked as well. After indexing, the dataset document and
    both annotation documents are read back through ``es_dsl_search`` and
    compared with the full expected dictionaries.
    """
    ds_id = '2000-01-01_00h00m'
    upload_dt = datetime.now().isoformat(' ')
    mol_db_id = 0
    last_finished = '2017-01-01T00:00:00'

    def db_sel_side_effect(sql, params):
        """Return canned rows for the two queries the exporter is expected to run."""
        if sql == DATASET_SEL:
            return [{
                'ds_id': ds_id,
                'ds_name': 'ds_name',
                'ds_input_path': 'ds_input_path',
                'ds_config': 'ds_config',
                'ds_meta': {},
                'ds_upload_dt': upload_dt,
                'ds_status': 'ds_status',
                'ds_last_finished': datetime.strptime(last_finished, '%Y-%m-%dT%H:%M:%S'),
                'ds_is_public': True,
                'ds_ion_img_storage': 'fs',
                'ds_acq_geometry': {}
            }]
        elif sql == ANNOTATIONS_SEL:
            return [{
                'sf': 'H2O',
                'sf_adduct': 'H2O+H',
                'chaos': 1,
                'image_corr': 1,
                'pattern_match': 1,
                'total_iso_ints': 100,
                'min_iso_ints': 0,
                'max_iso_ints': 100,
                'msm': 1,
                'adduct': '+H',
                'job_id': 1,
                'fdr': 0.1,
                'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'],
                'polarity': '+'
            }, {
                'sf': 'Au',
                'sf_adduct': 'Au+H',
                'chaos': 1,
                'image_corr': 1,
                'pattern_match': 1,
                'total_iso_ints': 100,
                'min_iso_ints': 0,
                'max_iso_ints': 100,
                'msm': 1,
                'adduct': '+H',
                'job_id': 1,
                'fdr': 0.05,
                'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'],
                'polarity': '+'
            }]
        else:
            # Unexpected query: report what was actually requested. The side
            # effect then falls through and returns None.
            logging.getLogger('engine').error(
                'Wrong db_sel_side_effect arguments: %s, %s', sql, params)

    db_mock = MagicMock(spec=DB)
    db_mock.select_with_fields.side_effect = db_sel_side_effect

    mol_db_mock = MagicMock(MolecularDB)
    mol_db_mock.id = mol_db_id
    mol_db_mock.name = 'db_name'
    mol_db_mock.version = '2017'
    mol_db_mock.get_molecules.return_value = pd.DataFrame([('H2O', 'mol_id', 'mol_name'), ('Au', 'mol_id', 'mol_name')],
                                                          columns=['sf', 'mol_id', 'mol_name'])

    # Centroid lookup keyed by (sum formula, adduct); only the m/z list is populated.
    isocalc_mock = MagicMock(IsocalcWrapper)
    isocalc_mock.ion_centroids = lambda sf, adduct: {
        ('H2O', '+H'): ([100., 200.], None),
        ('Au', '+H'): ([10., 20.], None)
    }[(sf, adduct)]

    es_exp = ESExporter(db_mock)
    # Remove any documents of this dataset before indexing it.
    es_exp.delete_ds(ds_id)
    es_exp.index_ds(ds_id=ds_id, mol_db=mol_db_mock, isocalc=isocalc_mock)

    wait_for_es(sec=1)

    ds_d = es_dsl_search.filter('term', _type='dataset').execute().to_dict()['hits']['hits'][0]['_source']
    assert ds_d == {
        'ds_last_finished': last_finished, 'ds_config': 'ds_config', 'ds_meta': {},
        'ds_status': 'ds_status', 'ds_name': 'ds_name', 'ds_input_path': 'ds_input_path', 'ds_id': ds_id,
        'ds_upload_dt': upload_dt,
        'annotation_counts': [{'db': {'name': 'db_name', 'version': '2017'},
                               'counts': [{'level': 5, 'n': 1}, {'level': 10, 'n': 2},
                                          {'level': 20, 'n': 2}, {'level': 50, 'n': 2}]}],
        'ds_is_public': True,
        'ds_acq_geometry': {},
        'ds_ion_img_storage': 'fs'
    }
    ann_1_d = es_dsl_search.filter('term', sf='H2O').execute().to_dict()['hits']['hits'][0]['_source']
    assert ann_1_d == {
        'pattern_match': 1, 'image_corr': 1, 'fdr': 0.1, 'chaos': 1, 'sf': 'H2O', 'min_iso_ints': 0,
        'msm': 1, 'sf_adduct': 'H2O+H', 'total_iso_ints': 100, 'centroid_mzs': [100., 200.],
        'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'], 'polarity': '+', 'job_id': 1, 'max_iso_ints': 100,
        'adduct': '+H', 'ds_name': 'ds_name', 'annotation_counts': [], 'db_version': '2017', 'ds_status': 'ds_status',
        'ion_add_pol': '[M+H]+', 'comp_names': ['mol_name'], 'db_name': 'db_name', 'mz': 100., 'ds_meta': {},
        'comp_ids': ['mol_id'], 'ds_config': 'ds_config', 'ds_input_path': 'ds_input_path', 'ds_id': ds_id,
        'ds_upload_dt': upload_dt, 'ds_last_finished': last_finished,
        'ds_ion_img_storage': 'fs', 'ds_is_public': True
    }
    ann_2_d = es_dsl_search.filter('term', sf='Au').execute().to_dict()['hits']['hits'][0]['_source']
    assert ann_2_d == {
        'pattern_match': 1, 'image_corr': 1, 'fdr': 0.05, 'chaos': 1, 'sf': 'Au', 'min_iso_ints': 0,
        'msm': 1, 'sf_adduct': 'Au+H', 'total_iso_ints': 100, 'centroid_mzs': [10., 20.],
        'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'], 'polarity': '+', 'job_id': 1, 'max_iso_ints': 100,
        'adduct': '+H',  'ds_name': 'ds_name', 'annotation_counts': [], 'db_version': '2017', 'ds_status': 'ds_status',
        'ion_add_pol': '[M+H]+', 'comp_names': ['mol_name'], 'db_name': 'db_name', 'mz': 10., 'ds_meta': {},
        'comp_ids': ['mol_id'], 'ds_config': 'ds_config', 'ds_input_path': 'ds_input_path', 'ds_id': ds_id,
        'ds_upload_dt': upload_dt, 'ds_last_finished': last_finished,
        'ds_ion_img_storage': 'fs', 'ds_is_public': True
    }
コード例 #30
0
                        help='Path to a dataset location')
    # Flag stored as `args.no_clean`; it is not read anywhere in the visible
    # part of this script. NOTE(review): presumably consumed elsewhere, confirm.
    parser.add_argument(
        '--no-clean',
        dest='no_clean',
        action='store_true',
        help="Don't clean dataset txt files after job is finished")
    # Path to the SM configuration file, defaulting to conf/config.json.
    parser.add_argument('--config',
                        dest='sm_config_path',
                        default='conf/config.json',
                        type=str,
                        help='SM config path')
    args = parser.parse_args()

    # Load the configuration and set up logging from its 'logs' section.
    SMConfig.set_path(args.sm_config_path)
    sm_config = SMConfig.get_conf()
    init_loggers(sm_config['logs'])

    # Wire the manager with a DB handle, an ES exporter and the image store wrapper.
    db = DB(sm_config['db'])
    img_store = ImageStoreServiceWrapper(
        sm_config['services']['img_service_url'])
    manager = SMDaemonManager(db, ESExporter(db), img_store)

    try:
        # Build the dataset from the command-line arguments and annotate it
        # with `del_first=True`.
        ds = create_ds_from_files(args.ds_id, args.ds_name, args.input_path)
        manager.annotate(ds, SearchJob, del_first=True)
    except Exception as e:
        # Log the failure with its traceback and exit with a non-zero status.
        logging.getLogger('engine').error(e, exc_info=True)
        sys.exit(1)

    # Success path: exit with the default (zero) status.
    sys.exit()
コード例 #31
0
def test_index_ds_works(sm_config, test_db, es, es_dsl_search, sm_index,
                        ds_config, metadata, annotation_stats):
    """Index a dataset with two annotations and verify the stored ES documents.

    Rows for the dataset, its job, submitter, group, ions and two annotations
    (H2O and Au) are inserted into the database. ``ESExporter.index_ds`` is
    then run with the molecule lookup patched and a mocked isotope calculator.
    The dataset document and both annotation documents are finally read back
    through ``es_dsl_search`` and compared with the complete expected dicts.
    """
    ds_id = '2000-01-01_00h00m'
    upload_dt = datetime.now().isoformat()
    last_finished = '2017-01-01 00:00:00'
    iso_image_ids = ['iso_img_id_1', 'iso_img_id_2']
    stats_json = json.dumps(annotation_stats)

    # --- database fixtures -------------------------------------------------
    db = DB()
    dataset_row = [
        ds_id,
        json.dumps(ds_config),
        json.dumps(metadata),
        upload_dt,
        upload_dt,
        'thumb-id',
    ]
    db.insert(
        "INSERT INTO dataset(id, name, input_path, config, metadata, upload_dt, status, "
        "status_update_dt, is_public, acq_geometry, ion_thumbnail) "
        "VALUES (%s, 'ds_name', 'ds_input_path', %s, %s, %s, 'ds_status', %s, true, '{}', %s)",
        [dataset_row],
    )
    moldb = create_test_molecular_db()
    (job_id, ) = db.insert_return(
        "INSERT INTO job(ds_id, moldb_id, status, start, finish) "
        "VALUES (%s, %s, 'job_status', %s, %s) RETURNING id",
        rows=[(ds_id, moldb.id, last_finished, last_finished)],
    )
    (user_id, ) = db.insert_return(
        "INSERT INTO graphql.user (email, name, role) "
        "VALUES ('email', 'user_name', 'user') RETURNING id",
        [[]],
    )
    (group_id, ) = db.insert_return(
        "INSERT INTO graphql.group (name, short_name) VALUES ('group name', 'grp') RETURNING id",
        [[]],
    )
    db.insert(
        "INSERT INTO graphql.dataset(id, user_id, group_id) VALUES (%s, %s, %s)",
        [[ds_id, user_id, group_id]],
    )
    h2o_ion_id, au_ion_id = db.insert_return(
        "INSERT INTO graphql.ion(ion, formula, chem_mod, neutral_loss, adduct, charge, ion_formula) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s) RETURNING id",
        [
            ['H2O-H+O-H+H', 'H2O', '-H+O', '-H', '+H', 1, 'HO2'],
            ['Au+H', 'Au', '', '', '+H', 1, 'HAu'],
        ],
    )
    h2o_annotation_row = [
        job_id, 'H2O', '-H+O', '-H', '+H', 1, 0.1, stats_json, iso_image_ids, h2o_ion_id,
    ]
    au_annotation_row = [
        job_id, 'Au', '', '', '+H', 1, 0.05, stats_json, iso_image_ids, au_ion_id,
    ]
    db.insert(
        "INSERT INTO annotation(job_id, formula, chem_mod, neutral_loss, adduct, "
        "msm, fdr, stats, iso_image_ids, ion_id) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
        [h2o_annotation_row, au_annotation_row],
    )

    # --- mocks ---------------------------------------------------------------
    def fake_centroids(formula):
        # A fresh mapping is built on every call so callers never share lists.
        return {
            'H2O+H': ([100.0, 200.0], None),
            'H2O-H+O-H+H': ([100.0, 200.0, 300.0], None),
            'Au+H': ([10.0, 20.0], None),
        }[formula]

    def fake_mass_accuracy_bounds(mzs):
        return (mzs, mzs)

    isocalc_mock = MagicMock(IsocalcWrapper)
    isocalc_mock.centroids = fake_centroids
    isocalc_mock.mass_accuracy_bounds = fake_mass_accuracy_bounds

    molecules_df = pd.DataFrame(
        [('H2O', 'mol_id', 'mol_name'), ('Au', 'mol_id', 'mol_name')],
        columns=['formula', 'mol_id', 'mol_name'],
    )

    # --- exercise --------------------------------------------------------------
    with patch('sm.engine.es_export.molecular_db.fetch_molecules', return_value=molecules_df):
        es_exp = ESExporter(db, sm_config)
        es_exp.delete_ds(ds_id)
        es_exp.index_ds(ds_id=ds_id, moldb=moldb, isocalc=isocalc_mock)

    wait_for_es(es, sm_config['elasticsearch']['index'])

    def first_hit_source(**term):
        """Return the ``_source`` of the first hit of a single-term filter."""
        response = es_dsl_search.filter('term', **term).execute().to_dict()
        return response['hits']['hits'][0]['_source']

    # --- dataset document --------------------------------------------------------
    ds_doc = first_hit_source(_type='dataset')
    ds_fields = {
        'ds_last_finished': last_finished,
        'ds_config': ds_config,
        'ds_adducts': ds_config['isotope_generation']['adducts'],
        'ds_moldb_ids': ds_config['database_ids'],
        'ds_chem_mods': [],
        'ds_neutral_losses': [],
        'ds_project_ids': [],
        'ds_project_names': [],
        'ds_meta': metadata,
        'ds_status': 'ds_status',
        'ds_status_update_dt': upload_dt,
        'ds_name': 'ds_name',
        'ds_input_path': 'ds_input_path',
        'ds_id': ds_id,
        'ds_upload_dt': upload_dt,
        'ds_is_public': True,
        'ds_submitter_email': 'email',
        'ds_submitter_id': user_id,
        'ds_submitter_name': 'user_name',
        'ds_group_approved': False,
        'ds_group_id': group_id,
        'ds_group_name': 'group name',
        'ds_group_short_name': 'grp',
    }
    expected_counts = [
        {'level': level, 'n': n} for level, n in ((5, 1), (10, 2), (20, 2), (50, 2))
    ]
    assert ds_doc == {
        **ds_fields,
        'ds_acq_geometry': {},
        'annotation_counts': [{
            'db': {'id': moldb.id, 'name': moldb.name},
            'counts': expected_counts,
        }],
    }

    # --- annotation documents ------------------------------------------------------
    h2o_doc = first_hit_source(formula='H2O')

    top_level_stats = {
        'pattern_match': annotation_stats['spectral'],
        'image_corr': annotation_stats['spatial'],
        'chaos': annotation_stats['chaos'],
    }
    metrics = {}
    for key, value in annotation_stats.items():
        if key in NON_METRIC_STATS:
            top_level_stats[key] = value
        else:
            metrics[key] = value

    bucket = sm_config["image_storage"]["bucket"]

    def expected_annotation(**specific):
        """Expected annotation document: shared fields plus per-annotation ones."""
        return {
            **ds_fields,
            **top_level_stats,
            'metrics': metrics,
            'msm': 1.0,
            'iso_image_ids': ['iso_img_id_1', 'iso_img_id_2'],
            'iso_image_urls': [
                f'http://localhost:9000/{bucket}/iso/{ds_id}/{img_id}'
                for img_id in ('iso_img_id_1', 'iso_img_id_2')
            ],
            'isobars': [],
            'isomer_ions': [],
            'polarity': '+',
            'job_id': 1,
            'adduct': '+H',
            'annotation_counts': [],
            'comp_names': ['mol_name'],
            'comps_count_with_isomers': 1,
            'db_id': moldb.id,
            'db_name': moldb.name,
            'db_version': moldb.version,
            'comp_ids': ['mol_id'],
            'off_sample_label': None,
            'off_sample_prob': None,
            **specific,
        }

    assert h2o_doc == expected_annotation(
        fdr=0.1,
        formula='H2O',
        ion='H2O-H+O-H+H+',
        ion_formula='HO2',
        centroid_mzs=[100.0, 200.0, 300.0],
        neutral_loss='-H',
        chem_mod='-H+O',
        mz=100.0,
        annotation_id=1,
    )

    au_doc = first_hit_source(formula='Au')
    assert au_doc == expected_annotation(
        fdr=0.05,
        formula='Au',
        ion='Au+H+',
        ion_formula='HAu',
        centroid_mzs=[10.0, 20.0],
        neutral_loss='',
        chem_mod='',
        mz=10.0,
        annotation_id=2,
    )
コード例 #32
0
def test_sm_daemon_es_export_fails(
    MSMSearchMock,
    post_images_to_image_store_mock,
    test_db,
    es_dsl_search,
    clean_isotope_storage,
    reset_queues,
    metadata,
    ds_config,
    queue_pub,
    local_sm_config,
):
    """Run the daemons with an ES exporter whose ``index_ds`` always raises.

    The search algorithm and the image store are mocked so that annotation
    itself succeeds. The test then asserts that the job row ends up
    ``FINISHED`` while the dataset row ends up ``FAILED``.
    """
    moldb = init_moldb()

    # Three ions of the same formula, all sharing identical metric values.
    n_ions = 3
    metric_columns = {
        'formula_i': list(range(n_ions)),
        'ion_formula': ['C12H24O-H2O+H', 'C12H24O-H2+O2-CO+Na', 'C12H24O+K'],
        'formula': ['C12H24O'] * n_ions,
        'modifier': ['-H2O+H', '-H2+O2-CO+Na', '+K'],
        'chem_mod': ['', '-H2+O2', ''],
        'neutral_loss': ['-H2O', '-CO', ''],
        'adduct': ['+H', '+Na', '+K'],
        'chaos': [0.9] * n_ions,
        'spatial': [0.9] * n_ions,
        'spectral': [0.9] * n_ions,
        'msm': [0.9**3] * n_ions,
        'total_iso_ints': [[100.0] for _ in range(n_ions)],
        'min_iso_ints': [[0] for _ in range(n_ions)],
        'max_iso_ints': [[10.0] for _ in range(n_ions)],
        'fdr': [0.1] * n_ions,
    }
    formula_metrics_df = pd.DataFrame(metric_columns).set_index('formula_i')

    search_algo_mock = MSMSearchMock()
    search_result = (formula_metrics_df, [], create_test_fdr_diagnostics_bundle())
    search_algo_mock.search.return_value = [search_result]
    metric_defaults = [
        ('chaos', 0),
        ('spatial', 0),
        ('spectral', 0),
        ('msm', 0),
        ('total_iso_ints', []),
        ('min_iso_ints', []),
        ('max_iso_ints', []),
    ]
    search_algo_mock.metrics = OrderedDict(metric_defaults)

    # Every formula index maps to the same image id list.
    image_ids = ['iso_image_1', None, None, None]
    post_images_to_image_store_mock.return_value = {
        formula_i: image_ids for formula_i in range(n_ions)
    }

    db = DB()

    def fail_indexing(*args, **kwargs):
        raise Exception('Test')

    # Replace indexing on this exporter instance so the ES export step fails.
    es = ESExporter(db, local_sm_config)
    es.index_ds = fail_indexing

    config_with_moldb = {**ds_config, 'database_ids': [moldb.id]}
    ds = create_test_ds(
        name=test_ds_name,
        input_path=input_dir_path,
        config=config_with_moldb,
        status=DatasetStatus.QUEUED,
        es=es,
    )

    annotate_msg = {
        'ds_id': ds.id,
        'ds_name': test_ds_name,
        'action': DaemonAction.ANNOTATE,
    }
    queue_pub.publish(annotate_msg)

    run_daemons(db, es, local_sm_config)

    # dataset and job tables asserts
    job_row = db.select_one('SELECT status from job')
    assert job_row[0] == 'FINISHED'
    dataset_row = db.select_one('SELECT status from dataset')
    assert dataset_row[0] == 'FAILED'
コード例 #33
0
def test_sm_daemons(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                    post_images_to_annot_service_mock,
                    MolDBServiceWrapperMock,
                    sm_config, test_db, es_dsl_search, clean_isotope_storage):
    """End-to-end check of the daemons for a single queued dataset.

    A dataset row is inserted, an 'annotate' message is published and the
    daemons are run with the metric computation, the metric filter, the image
    posting service and the molecular DB service mocked. Afterwards the
    ``dataset``, ``job`` and ``iso_image_metrics`` tables and the ES documents
    are checked. The DB connection is closed and the data directory is removed
    in the ``finally`` clause regardless of the outcome.
    """
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)

    get_compute_img_metrics_mock.return_value = lambda *args: (0.9, 0.9, 0.9, [100.], [0], [10.])
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {
        'iso_image_ids': ['iso_image_1', None, None, None]
    }
    post_images_to_annot_service_mock.return_value = {
        35: url_dict,
        44: url_dict
    }

    db = DB(sm_config['db'])
    es = ESExporter(db)
    # NOTE: these two names are never reassigned in this function, so the
    # corresponding `stop()` calls in the `finally` clause are never reached.
    annotate_daemon = None
    update_daemon = None

    try:
        # Read the dataset config through a context manager so the file handle
        # is closed immediately instead of being left to the garbage collector.
        with open(ds_config_path) as ds_config_file:
            ds_config_str = ds_config_file.read()
        upload_dt = datetime.now()
        ds_id = '2000-01-01_00h00m'
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{"Data_Type": "Imaging MS"}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])

        ds = Dataset.load(db, ds_id)
        queue_pub.publish({'ds_id': ds.id, 'ds_name': ds.name, 'action': 'annotate'})

        run_daemons(db, es)

        # dataset table asserts
        rows = db.select('SELECT id, name, input_path, upload_dt, status from dataset')
        input_path = join(dirname(__file__), 'data', test_ds_name)
        assert len(rows) == 1
        assert rows[0] == (ds_id, test_ds_name, input_path, upload_dt, DatasetStatus.FINISHED)

        # ms acquisition geometry asserts
        rows = db.select('SELECT acq_geometry from dataset')
        assert len(rows) == 1
        assert rows[0][0] == ds.get_acq_geometry(db)
        assert rows[0][0] == {
            ACQ_GEOMETRY_KEYS.LENGTH_UNIT: 'nm',
            ACQ_GEOMETRY_KEYS.AcqGridSection.section_name: {
                ACQ_GEOMETRY_KEYS.AcqGridSection.REGULAR_GRID: True,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_X: 3,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_COUNT_Y: 3,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_X: 100,
                ACQ_GEOMETRY_KEYS.AcqGridSection.PIXEL_SPACING_Y: 100
            },
            ACQ_GEOMETRY_KEYS.PixelSizeSection.section_name: {
                ACQ_GEOMETRY_KEYS.PixelSizeSection.REGULAR_SIZE: True,
                ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_X: 100,
                ACQ_GEOMETRY_KEYS.PixelSizeSection.PIXEL_SIZE_Y: 100
            }
        }

        # job table asserts
        rows = db.select('SELECT db_id, ds_id, status, start, finish from job')
        assert len(rows) == 1
        # `ds_id` is rebound here to the value stored in the job row; the next
        # assert checks it against the same literal id used above.
        db_id, ds_id, status, start, finish = rows[0]
        assert (db_id, ds_id, status) == (0, '2000-01-01_00h00m', JobStatus.FINISHED)
        assert start < finish

        # image metrics asserts
        rows = db.select(('SELECT db_id, sf, adduct, stats, iso_image_ids '
                          'FROM iso_image_metrics '
                          'ORDER BY sf, adduct'))

        assert rows[0] == (0, 'C12H24O', '+K', {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9,
                                                'total_iso_ints': [100.], 'min_iso_ints': [0], 'max_iso_ints': [10.]},
                           ['iso_image_1', None, None, None])
        assert rows[1] == (0, 'C12H24O', '+Na', {'chaos': 0.9, 'spatial': 0.9, 'spectral': 0.9,
                                                 'total_iso_ints': [100.], 'min_iso_ints': [0], 'max_iso_ints': [10.]},
                           ['iso_image_1', None, None, None])

        time.sleep(1)  # Waiting for ES
        # ES asserts
        ds_docs = es_dsl_search.query('term', _type='dataset').execute().to_dict()['hits']['hits']
        assert 1 == len(ds_docs)
        ann_docs = es_dsl_search.query('term', _type='annotation').execute().to_dict()['hits']['hits']
        # `rows` still holds the iso_image_metrics rows selected above, so this
        # compares the number of annotation documents with the number of metric rows.
        assert len(ann_docs) == len(rows)
        for doc in ann_docs:
            assert doc['_id'].startswith(ds_id)

    finally:
        db.close()
        if annotate_daemon:
            annotate_daemon.stop()
        if update_daemon:
            update_daemon.stop()
        # Best-effort removal of the data directory; `warn_only()` wraps the shell call.
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
コード例 #34
0
import argparse
from os.path import abspath
import json

from sm.engine.es_export import ESExporter


if __name__ == '__main__':
    # Command-line entry point: create the ElasticSearch index named 'sm',
    # optionally deleting an existing one first (--drop).
    arg_parser = argparse.ArgumentParser(description='Create ElasticSearch indices')
    arg_parser.add_argument('--conf', default='conf/config.json', help="SM config path")
    arg_parser.add_argument('--drop', action='store_true', help='Delete index if exists')
    cli_args = arg_parser.parse_args()

    index_name = 'sm'
    conf_path = abspath(cli_args.conf)
    with open(conf_path) as conf_file:
        # The exporter is constructed from the parsed JSON configuration.
        sm_conf = json.load(conf_file)
        exporter = ESExporter(sm_conf)

        if cli_args.drop:
            exporter.delete_index(index_name)

        exporter.create_index(index_name)
コード例 #35
0
def test_delete_ds__completely(sm_config, test_db, es, sm_index):
    """Check that ``delete_ds`` removes every ES document of one dataset only.

    Three annotation documents (two for 'dataset1', one for 'dataset2') and
    one dataset document for 'dataset1' are created. After deleting
    'dataset1', its annotation and dataset documents must be gone while the
    'dataset2' annotation must remain.
    """
    hmdb = MolecularDB(0, 'HMDB', '2016')
    chebi = MolecularDB(1, 'ChEBI', '2016')

    index = sm_config['elasticsearch']['index']

    # (doc_type, document id, dataset id, molecular database)
    seed_docs = [
        ('annotation', 'id1', 'dataset1', hmdb),
        ('annotation', 'id2', 'dataset1', chebi),
        ('annotation', 'id3', 'dataset2', hmdb),
        ('dataset', 'dataset1', 'dataset1', hmdb),
    ]
    for doc_type, doc_id, doc_ds_id, doc_moldb in seed_docs:
        es.create(
            index=index,
            doc_type=doc_type,
            id=doc_id,
            body={
                'ds_id': doc_ds_id,
                'db_id': doc_moldb.id,
                'db_name': doc_moldb.name,
                'db_version': doc_moldb.version,
            },
        )

    wait_for_es(es, index)

    db_mock = MagicMock(spec=DB)

    es_exporter = ESExporter(db_mock, sm_config)
    es_exporter.delete_ds(ds_id='dataset1')

    wait_for_es(es, index)

    def count_docs(doc_type, **terms):
        """Count documents of ``doc_type`` matching all given term filters."""
        term_filters = [{'term': {field: value}} for field, value in terms.items()]
        query = {'query': {'bool': {'filter': term_filters}}}
        return es.count(index=index, doc_type=doc_type, body=query)['count']

    # Both annotations of the deleted dataset are gone ...
    assert count_docs('annotation', ds_id='dataset1', db_id=hmdb.id) == 0
    assert count_docs('annotation', ds_id='dataset1', db_id=chebi.id) == 0
    # ... the other dataset's annotation is untouched ...
    assert count_docs('annotation', ds_id='dataset2', db_id=hmdb.id) == 1
    # ... and the dataset document itself is removed as well.
    assert count_docs('dataset', ds_id='dataset1', _type='dataset') == 0