Example 1
def test_save_ds_meta_ds_doesnt_exist(spark_context, create_test_db,
                                      drop_test_db, sm_config, ds_config):
    work_dir_man_mock = MagicMock(WorkDirManager)
    work_dir_man_mock.ds_coord_path = '/ds_path'
    work_dir_man_mock.txt_path = '/txt_path'

    SMConfig._config_dict = sm_config

    with patch('sm.engine.tests.util.SparkContext.textFile') as m:
        m.return_value = spark_context.parallelize(['0,1,1\n', '1,100,200\n'])

        dataset = Dataset(spark_context, 'ds_name', '', 'input_path',
                          ds_config, work_dir_man_mock, DB(sm_config['db']))
        dataset.save_ds_meta()

    db = DB(sm_config['db'])
    ds_row = db.select_one(
        'SELECT name, file_path, img_bounds, config from dataset')
    assert ds_row == ('ds_name', 'input_path', {
        u'x': {
            u'min': 1,
            u'max': 100
        },
        u'y': {
            u'min': 1,
            u'max': 200
        }
    }, ds_config)

    coord_row = db.select_one('SELECT xs, ys from coordinates')
    assert coord_row == ([1, 100], [1, 200])

    db.close()
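All of the examples on this page call the same small database helper from sm.engine.db. The real implementation is not shown here; below is only a rough sketch of the interface these call sites appear to assume (a psycopg2-backed guess for illustration, not the actual sm.engine code), covering just the methods the snippets use.

# Hypothetical sketch of the DB helper used throughout these examples.
# Names, signatures and behaviour are inferred from the call sites only.
import psycopg2


class DBSketch:
    def __init__(self, config=None):
        # config is assumed to be the 'db' section of the SM config
        self._conn = psycopg2.connect(**(config or {}))

    def select(self, sql, *args, params=None):
        with self._conn.cursor() as cur:
            cur.execute(sql, params or args or None)
            return cur.fetchall()

    def select_one(self, sql, *args, params=None):
        rows = self.select(sql, *args, params=params)
        # the tests above compare against [] when no row matches
        return rows[0] if rows else []

    def alter(self, sql, params=None):
        with self._conn.cursor() as cur:
            cur.execute(sql, params)
        self._conn.commit()

    def insert(self, sql, rows):
        with self._conn.cursor() as cur:
            cur.executemany(sql, rows)
        self._conn.commit()

    def close(self):
        self._conn.close()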
Example 2
def test_save_ds_meta_ds_doesnt_exist(spark_context, create_test_db, drop_test_db, sm_config, ds_config):
    work_dir_man_mock = MagicMock(WorkDirManager)
    work_dir_man_mock.ds_coord_path = '/ds_path'
    work_dir_man_mock.txt_path = '/txt_path'

    SMConfig._config_dict = sm_config

    with patch('sm.engine.tests.util.SparkContext.textFile') as m:
        m.return_value = spark_context.parallelize([
            '0,1,1\n',
            '1,100,200\n'])

        dataset = Dataset(spark_context, 'ds_name', '', ds_config, work_dir_man_mock, DB(sm_config['db']))
        dataset.save_ds_meta()

    db = DB(sm_config['db'])
    ds_row = db.select_one('SELECT name, file_path, img_bounds, config from dataset')
    assert ds_row == ('ds_name', '/txt_path',
                      {u'x': {u'min': 1, u'max': 100}, u'y': {u'min': 1, u'max': 200}},
                      ds_config)

    coord_row = db.select_one('SELECT xs, ys from coordinates')
    assert coord_row == ([1, 100], [1, 200])

    db.close()
Example 3
def test_sm_daemons_annot_fails(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                                post_images_to_annot_service_mock,
                                MolDBServiceWrapperMock,
                                sm_config, test_db, es_dsl_search,
                                clean_isotope_storage):
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)

    def throw_exception_function(*args):
        raise Exception('Test')
    get_compute_img_metrics_mock.return_value = throw_exception_function
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {
        'iso_image_ids': ['iso_image_1', None, None, None]
    }
    post_images_to_annot_service_mock.return_value = {
        35: url_dict,
        44: url_dict
    }

    db = DB(sm_config['db'])
    es = ESExporter(db)
    annotate_daemon = None

    try:
        ds_id = '2000-01-01_00h00m'
        upload_dt = datetime.now()
        ds_config_str = open(ds_config_path).read()
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])

        ds = Dataset.load(db, ds_id)
        queue_pub.publish({'ds_id': ds.id, 'ds_name': ds.name, 'action': 'annotate'})

        run_daemons(db, es)

        # assertions on the dataset and job tables
        row = db.select_one('SELECT status from dataset')
        assert row[0] == 'FAILED'
        row = db.select_one('SELECT status from job')
        assert row[0] == 'FAILED'
    finally:
        db.close()
        if annotate_daemon:
            annotate_daemon.stop()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
Example 4
def test_search_job_imzml_example_es_export_fails(get_compute_img_metrics_mock, filter_sf_metrics_mock,
                                                  post_images_to_annot_service_mock,
                                                  MolDBServiceWrapperMock, MolDBServiceWrapperMock2,
                                                  sm_config, create_fill_sm_database, es_dsl_search,
                                                  clean_isotope_storage):
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock)
    init_mol_db_service_wrapper_mock(MolDBServiceWrapperMock2)

    get_compute_img_metrics_mock.return_value = lambda *args: (0.9, 0.9, 0.9, [100.], [0], [10.])
    filter_sf_metrics_mock.side_effect = lambda x: x

    url_dict = {
        'iso_image_ids': ['iso_image_1', None, None, None]
    }
    post_images_to_annot_service_mock.return_value = {
        35: url_dict,
        44: url_dict
    }

    db = DB(sm_config['db'])

    def throw_exception_function(*args):
        raise Exception('Test')

    try:
        ds_id = '2000-01-01_00h00m'
        upload_dt = datetime.now()
        ds_config_str = open(ds_config_path).read()
        db.insert(Dataset.DS_INSERT, [{
            'id': ds_id,
            'name': test_ds_name,
            'input_path': input_dir_path,
            'upload_dt': upload_dt,
            'metadata': '{}',
            'config': ds_config_str,
            'status': DatasetStatus.QUEUED,
            'is_public': True,
            'mol_dbs': ['HMDB-v4'],
            'adducts': ['+H', '+Na', '+K'],
            'ion_img_storage': 'fs'
        }])

        with patch('sm.engine.search_job.ESExporter.index_ds') as index_ds_mock:
            index_ds_mock.side_effect = throw_exception_function

            img_store = ImageStoreServiceWrapper(sm_config['services']['img_service_url'])
            job = SearchJob(img_store=img_store)
            ds = Dataset.load(db, ds_id)
            job.run(ds)
    except ESExportFailedError as e:
        assert e
        # assertions on the dataset table
        row = db.select_one('SELECT status from dataset')
        assert row[0] == 'FAILED'
    else:
        raise AssertionError('ESExportFailedError should be raised')
    finally:
        db.close()
        with warn_only():
            local('rm -rf {}'.format(data_dir_path))
Example 5
def update_optical_images(ds_id_str, sql_where):
    db = DB()

    if ds_id_str:
        ds_ids = ds_id_str.split(',')
    else:
        ds_ids = [
            id for (id, ) in db.select(
                f'SELECT DISTINCT dataset.id FROM dataset WHERE {sql_where}')
        ]

    for i, ds_id in enumerate(ds_ids):
        try:
            transform, img_id = db.select_one(
                'SELECT transform, optical_image from dataset WHERE id = %s',
                params=(ds_id, ))
            if img_id and transform:
                logger.info(
                    f'[{i + 1}/{len(ds_ids)}] Updating optical image of dataset {ds_id}'
                )
                add_optical_image(db, ds_id, img_id, transform)
            else:
                logger.info(
                    f'[{i + 1}/{len(ds_ids)}] Skipping dataset {ds_id}')
        except Exception:
            logger.error(f'Failed to update optical image on {ds_id}',
                         exc_info=True)
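The function above accepts either an explicit comma-separated list of dataset ids or a raw SQL filter over the dataset table; a hedged usage sketch (the id and filter values below are illustrative, not taken from the original script):

# Hypothetical calls to update_optical_images() defined above
update_optical_images('2000-01-01_00h00m,2000-01-02_00h00m', sql_where=None)  # explicit dataset ids
update_optical_images(None, sql_where="status = 'FINISHED'")                  # every dataset matching the filter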
Example 6
def migrate_optical_images(ds_id):
    output.print('Migrating optical images')

    with timeit():
        output.print('Transferring images and updating database...')
        db = DB()
        rows = db.select(SEL_OPTICAL_IMGS, params=(ds_id,))
        for opt_image_id, opt_image_url in rows:
            if not opt_image_url and opt_image_id:
                transfer_images(
                    ds_id,
                    'optical_images',
                    image_storage.OPTICAL,
                    [opt_image_id],
                )
                opt_image_url = image_storage.get_image_url(
                    image_storage.OPTICAL, ds_id, opt_image_id
                )
                db.alter(UPD_OPTICAL_IMGS, params=(opt_image_url, opt_image_id))

        opt_thumb_id, opt_thumb_url = db.select_one(SEL_OPT_THUMB, params=(ds_id,))
        if not opt_thumb_url and opt_thumb_id:
            transfer_images(
                ds_id,
                'optical_images',
                image_storage.OPTICAL,
                [opt_thumb_id],
            )
            opt_thumb_url = image_storage.get_image_url(image_storage.OPTICAL, ds_id, opt_thumb_id)
            db.alter(UPD_OPT_THUMB, params=(opt_thumb_url, ds_id))
Example 7
    def test_delete_ds(self, EsMock, fill_db):
        db = DB()
        manager = create_daemon_man(db=db, es=EsMock())

        ds_id = '2000-01-01'
        ds = create_ds(ds_id=ds_id)

        manager.delete(ds)

        EsMock.return_value.delete_ds.assert_has_calls([call(ds_id)])
        assert db.select_one('SELECT * FROM dataset WHERE id = %s', params=(ds_id,)) == []
Example 8
def test_create_moldb_malformed_csv(file, fill_db):
    input_doc = moldb_input_doc(file_path=f's3://{BUCKET_NAME}/{file.value}')
    with patch_bottle_request(input_doc):

        resp = api.databases.create()

        assert resp['status'] == MALFORMED_CSV['status']
        assert resp['error']

        db = DB()
        (db_count, ) = db.select_one(MOLDB_COUNT_SEL)
        assert db_count == 0
Example 9
def test_delete_moldb(fill_db):
    input_doc = moldb_input_doc(
        file_path=f's3://{BUCKET_NAME}/{MoldbFiles.VALID.value}')
    moldb = create_test_molecular_db(**input_doc)
    with patch_bottle_request(req_doc={}):

        resp = api.databases.delete(moldb_id=moldb.id)

        assert resp['status'] == 'success'

        db = DB()
        (db_count, ) = db.select_one(MOLDB_COUNT_SEL)
        assert db_count == 0
Example 10
def test_create_moldb_empty_values(fill_db):
    input_doc = moldb_input_doc(
        file_path=f's3://{BUCKET_NAME}/{MoldbFiles.EMPTY_VALUES.value}')
    with patch_bottle_request(input_doc):

        resp = api.databases.create()

        assert resp['status'] == BAD_DATA['status']
        assert resp['error'] and resp['details']

        db = DB()
        (db_count, ) = db.select_one(MOLDB_COUNT_SEL)
        assert db_count == 0
Example 11
def save_additional_info_to_db(db_id, user_id, input_path):
    conf = SMConfig.get_conf()
    with ConnectionPool(conf['db']):
        db = DB()
        if db.select_one('SELECT * FROM molecular_db WHERE id = %s',
                         (db_id, )):
            print(f'Updating existing molecular database {db_id}')
            DB().alter(
                'UPDATE molecular_db SET user_id = %s, input_path = %s WHERE id = %s',
                (user_id, input_path, db_id),
            )
        else:
            print(f'Specified molecular database {db_id} does not exist.')
Example 12
def test_sm_daemons_annot_fails(
    MSMSearchMock,
    post_images_to_image_store_mock,
    test_db,
    es_dsl_search,
    clean_isotope_storage,
    reset_queues,
    metadata,
    ds_config,
    queue_pub,
    local_sm_config,
):
    moldb = init_moldb()

    def throw_exception_function(*args, **kwargs):
        raise Exception('Test exception')

    msm_algo_mock = MSMSearchMock()
    msm_algo_mock.search.side_effect = throw_exception_function

    image_ids = ['iso_image_1', None, None, None]
    post_images_to_image_store_mock.return_value = {
        0: image_ids,
        1: image_ids,
        2: image_ids
    }

    db = DB()
    es = ESExporter(db, local_sm_config)
    ds = create_test_ds(
        name=test_ds_name,
        input_path=input_dir_path,
        config={
            **ds_config, 'database_ids': [moldb.id]
        },
        status=DatasetStatus.QUEUED,
        es=es,
    )

    queue_pub.publish({
        'ds_id': ds.id,
        'ds_name': test_ds_name,
        'action': DaemonAction.ANNOTATE
    })

    run_daemons(db, es, local_sm_config)

    # assertions on the dataset and job tables
    row = db.select_one('SELECT status from dataset')
    assert len(row) == 1
    assert row[0] == 'FAILED'
Example 13
    def test_delete_ds(self, fill_db, sm_config, ds_config):
        db = DB(sm_config['db'])
        es_mock = MagicMock(spec=ESExporter)
        img_store_service_mock = MagicMock(spec=ImageStoreServiceWrapper)
        manager = create_daemon_man(sm_config, db=db, es=es_mock, img_store=img_store_service_mock)

        ds_id = '2000-01-01'
        ds = create_ds(ds_id=ds_id, ds_config=ds_config)

        manager.delete(ds)

        ids = ['iso_image_{}_id'.format(id) for id in range(1, 3)]
        img_store_service_mock.delete_image_by_id.assert_has_calls(
            [call('fs', 'iso_image', ids[0]), call('fs', 'iso_image', ids[1])])
        es_mock.delete_ds.assert_called_with(ds_id)
        assert db.select_one('SELECT * FROM dataset WHERE id = %s', params=(ds_id,)) == []
Example 14
def migrate_ion_thumbnail(ds_id):
    output.print('Migrating ion thumbnail images')

    with timeit():
        output.print('Transferring images and updating database...')
        db = DB()
        ion_thumb_id, ion_thumbnail_url = db.select_one(SEL_ION_THUMB, params=(ds_id,))
        if not ion_thumbnail_url and ion_thumb_id:
            transfer_images(
                ds_id,
                'ion_thumbnails',
                image_storage.THUMB,
                [ion_thumb_id],
            )
            ion_thumb_url = image_storage.get_image_url(image_storage.THUMB, ds_id, ion_thumb_id)
            db.alter(UPD_ION_THUMB, params=(ion_thumb_url, ds_id))
Example 15
def test_create_moldb_wrong_formulas(fill_db):
    input_doc = moldb_input_doc(
        file_path=f's3://{BUCKET_NAME}/{MoldbFiles.WRONG_FORMULAS.value}')
    with patch_bottle_request(input_doc):

        resp = api.databases.create()

        assert resp['status'] == BAD_DATA['status']
        assert resp['error'], resp['details']
        for err_row in resp['details']:
            assert all([
                err_row.get(err_field, None)
                for err_field in ['line', 'row', 'error']
            ])

        db = DB()
        (db_count, ) = db.select_one(MOLDB_COUNT_SEL)
        assert db_count == 0
Example 16
def test_creates_ion_thumbnail(test_db, algorithm, metadata, ds_config):
    db = DB()
    ds = _make_fake_ds(db, metadata, ds_config)

    with patch('sm.engine.postprocessing.ion_thumbnail.image_storage'
               ) as image_storage_mock:
        image_storage_mock.post_image.return_value = IMG_ID
        image_storage_mock.get_image_url.return_value = IMG_URL
        image_storage_mock.get_ion_images_for_analysis.side_effect = (
            _mock_get_ion_images_for_analysis)

        generate_ion_thumbnail(db, ds, algorithm=algorithm)

        ion_thumbnail, ion_thumbnail_url = db.select_one(
            "SELECT ion_thumbnail, ion_thumbnail_url FROM dataset WHERE id = %s",
            [ds.id])
        assert ion_thumbnail == IMG_ID
        assert ion_thumbnail_url == IMG_URL
        assert image_storage_mock.post_image.called
Example 17
    def test_annotate_ds(self, test_db, sm_config, ds_config):
        es_mock = MagicMock(spec=ESExporter)
        db = DB(sm_config['db'])
        try:
            manager = create_daemon_man(sm_config, db=db, es=es_mock)

            ds_id = '2000-01-01'
            ds_name = 'ds_name'
            input_path = 'input_path'
            upload_dt = datetime.now()
            metadata = {}
            ds = create_ds(ds_id=ds_id, ds_name=ds_name, input_path=input_path, upload_dt=upload_dt,
                           metadata=metadata, ds_config=ds_config)

            manager.annotate(ds, search_job_factory=self.SearchJob)

            DS_SEL = 'select name, input_path, upload_dt, metadata, config from dataset where id=%s'
            assert db.select_one(DS_SEL, params=(ds_id,)) == (ds_name, input_path, upload_dt, metadata, ds_config)
        finally:
            db.close()
Example 18
def run(ds_id, sql_where):

    conf = SMConfig.get_conf()

    db = DB(conf['db'])
    img_store = ImageStoreServiceWrapper(conf['services']['img_service_url'])

    if sql_where:
        ds_ids = [
            id for (id,) in db.select(f'SELECT DISTINCT dataset.id FROM dataset WHERE {sql_where}')
        ]
    else:
        ds_ids = ds_id.split(',')

    if not ds_ids:
        logger.warning('No datasets match filter')
        return

    for i, ds_id in enumerate(ds_ids):
        try:
            logger.info(f'[{i+1} / {len(ds_ids)}] Updating acq geometry for {ds_id}')
            ds = Dataset.load(db, ds_id)
            (sample_img_id,) = db.select_one(
                "SELECT iim.iso_image_ids[1] from job j "
                "JOIN iso_image_metrics iim on j.id = iim.job_id "
                "WHERE j.ds_id = %s LIMIT 1",
                [ds_id],
            )
            print(sample_img_id)
            if sample_img_id:
                w, h = img_store.get_image_by_id('fs', 'iso_image', sample_img_id).size
                dims = (h, w)  # (n_rows, n_cols)
            else:
                dims = (None, None)

            acq_geometry = make_acq_geometry('ims', None, ds.metadata, dims)

            ds.save_acq_geometry(db, acq_geometry)

        except Exception:
            logger.error(f'Failed on {ds_id}', exc_info=True)
Example 19
def save_scoring_model_to_db(name, type_, params):
    """Adds/updates the scoring_model in the local database"""
    # Import DB locally so that Lithops doesn't try to pickle it & fail due to psycopg2
    # pylint: disable=import-outside-toplevel  # circular import
    from sm.engine.db import DB

    if not isinstance(params, str):
        params = json.dumps(params)

    db = DB()
    if db.select_one('SELECT * FROM scoring_model WHERE name = %s', (name, )):
        logger.info(f'Updating existing scoring model {name}')
        DB().alter(
            'UPDATE scoring_model SET type = %s, params = %s WHERE name = %s',
            (type_, params, name),
        )
    else:
        logger.info(f'Inserting new scoring model {name}')
        DB().alter(
            'INSERT INTO scoring_model(name, type, params) VALUES (%s, %s, %s)',
            (name, type_, params),
        )
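The select-then-alter pattern above needs two round trips per call. Assuming the scoring_model.name column carries a UNIQUE constraint (not verified in this snippet), the same add-or-update behaviour could be expressed as a single PostgreSQL upsert; a sketch:

# Hypothetical single-statement variant; assumes a UNIQUE constraint on scoring_model.name
UPSERT_SCORING_MODEL = (
    'INSERT INTO scoring_model (name, type, params) VALUES (%s, %s, %s) '
    'ON CONFLICT (name) DO UPDATE SET type = EXCLUDED.type, params = EXCLUDED.params'
)
DB().alter(UPSERT_SCORING_MODEL, (name, type_, params))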
Example 20
    def test_annotate_ds(self, AnnotationJobMock, fill_db, metadata, ds_config):
        es_mock = MagicMock(spec=ESExporter)
        db = DB()
        manager = create_daemon_man(db=db, es=es_mock)

        ds_id = '2000-01-01'
        ds_name = 'ds_name'
        input_path = 'input_path'
        upload_dt = datetime.now()
        ds = create_ds(
            ds_id=ds_id,
            ds_name=ds_name,
            input_path=input_path,
            upload_dt=upload_dt,
            metadata=metadata,
        )

        manager.annotate(ds)

        DS_SEL = 'select name, input_path, upload_dt, metadata, config from dataset where id=%s'
        results = db.select_one(DS_SEL, params=(ds_id,))
        assert results[3] == metadata
        assert results[4] == ds_config
Example 21
def ensure_db_populated(sm_config, analysis_version, database):
    db = DB()
    # Install DB schema if needed
    query = "SELECT COUNT(*) FROM pg_tables WHERE schemaname = 'public' AND tablename = 'dataset'"
    tables_exist = db.select_one(query)[0] >= 1
    if not tables_exist:
        print('Installing DB schema')
        db.alter(DB_SQL_SCHEMA)

    # Import HMDB if needed
    moldb = MOL_DBS[database]
    try:
        molecular_db.find_by_name_version(moldb['name'], moldb['version'])
    except SMError:
        print(f'Importing {database}')
        with TemporaryDirectory() as tmp:
            urlretrieve(moldb['url'], f'{tmp}/moldb.tsv')
            molecular_db.create(moldb['name'], moldb['version'],
                                f'{tmp}/moldb.tsv')

    if analysis_version > 1:
        v3_model_rows = db.select("SELECT name FROM scoring_model WHERE name = 'v3_default'")
        if len(v3_model_rows) == 0:
            print("Importing v3_default scoring model")
            params = upload_catboost_scoring_model(
                model=Path(proj_root()) /
                '../scoring-models/v3_default/model-2022-01-05T13-45-26.947188-416b1311.cbm',
                bucket=sm_config['lithops']['lithops']['storage_bucket'],
                prefix=f'test_scoring_models/v3_default',
                is_public=False,
            )
            save_scoring_model_to_db(name='v3_default',
                                     type_='catboost',
                                     params=params)
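A possible call of the helper above; the MOL_DBS key and the analysis_version value are illustrative only, since neither is defined in this snippet:

# Hypothetical call; 'HMDB-v4' is assumed to be a key of the MOL_DBS lookup used above
sm_config = SMConfig.get_conf()
ensure_db_populated(sm_config, analysis_version=2, database='HMDB-v4')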
Example 22
    parser = argparse.ArgumentParser(description='Exporting isotopic images')
    parser.add_argument('ds_name', type=str, help='Dataset name')
    parser.add_argument('sf', type=str, help='sum formula')
    parser.add_argument('pkl_path', type=str, help='Path for the cPickle file')
    parser.add_argument('--config',
                        dest='sm_config_path',
                        type=str,
                        help='SM config path')
    parser.set_defaults(
        sm_config_path=path.join(proj_root(), 'conf/config.json'))
    args = parser.parse_args()

    SMConfig.set_path(args.sm_config_path)
    db = DB(SMConfig.get_conf()['db'])

    ds_config, img_bounds = db.select_one(DS_CONFIG_SEL, args.ds_name)
    nrows, ncols = get_img_dims(img_bounds)
    isotope_gen_config = ds_config['isotope_generation']
    charge = '{}{}'.format(isotope_gen_config['charge']['polarity'],
                           isotope_gen_config['charge']['n_charges'])
    export_rs = db.select(EXPORT_SEL, args.ds_name, args.sf)

    export_df = pd.DataFrame(
        export_rs, columns=['sf', 'adduct', 'peak', 'pxl_inds', 'ints'])
    export_df['img_dims'] = [(img_bounds['y']['min'], img_bounds['y']['max'],
                              img_bounds['x']['min'], img_bounds['x']['max'])
                             ] * len(export_df)
    # export_df['img'] = export_df.apply(lambda r: build_matrix(np.array(r['pxl_inds']),
    #                                                           np.array(r['ints']), nrows, ncols), axis=1)
    # export_df.drop(['pxl_inds', 'ints'], axis=1, inplace=True)
    # export_df.to_csv(args.csv_path, index=False)
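The snippet above is the argument-parsing half of a standalone export script, so it would be invoked from the command line roughly as follows; the script filename is a placeholder, and only the three positional arguments and --config come from the parser above:

# Hypothetical invocation (script name is a placeholder):
#   python export_sf_images.py my_dataset C42H82NO8P /tmp/sf_images.pkl --config conf/config.json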
Example 23
def test_sm_daemon_es_export_fails(
    MSMSearchMock,
    post_images_to_image_store_mock,
    test_db,
    es_dsl_search,
    clean_isotope_storage,
    reset_queues,
    metadata,
    ds_config,
    queue_pub,
    local_sm_config,
):
    moldb = init_moldb()

    formula_metrics_df = pd.DataFrame({
        'formula_i': [0, 1, 2],
        'ion_formula': ['C12H24O-H2O+H', 'C12H24O-H2+O2-CO+Na', 'C12H24O+K'],
        'formula': ['C12H24O', 'C12H24O', 'C12H24O'],
        'modifier': ['-H2O+H', '-H2+O2-CO+Na', '+K'],
        'chem_mod': ['', '-H2+O2', ''],
        'neutral_loss': ['-H2O', '-CO', ''],
        'adduct': ['+H', '+Na', '+K'],
        'chaos': [0.9, 0.9, 0.9],
        'spatial': [0.9, 0.9, 0.9],
        'spectral': [0.9, 0.9, 0.9],
        'msm': [0.9**3, 0.9**3, 0.9**3],
        'total_iso_ints': [[100.0], [100.0], [100.0]],
        'min_iso_ints': [[0], [0], [0]],
        'max_iso_ints': [[10.0], [10.0], [10.0]],
        'fdr': [0.1, 0.1, 0.1],
    }).set_index('formula_i')
    search_algo_mock = MSMSearchMock()
    search_algo_mock.search.return_value = [
        (formula_metrics_df, [], create_test_fdr_diagnostics_bundle())
    ]
    search_algo_mock.metrics = OrderedDict([
        ('chaos', 0),
        ('spatial', 0),
        ('spectral', 0),
        ('msm', 0),
        ('total_iso_ints', []),
        ('min_iso_ints', []),
        ('max_iso_ints', []),
    ])
    image_ids = ['iso_image_1', None, None, None]
    post_images_to_image_store_mock.return_value = {
        0: image_ids,
        1: image_ids,
        2: image_ids
    }

    db = DB()

    def throw_exception_function(*args, **kwargs):
        raise Exception('Test')

    es = ESExporter(db, local_sm_config)
    es.index_ds = throw_exception_function

    ds = create_test_ds(
        name=test_ds_name,
        input_path=input_dir_path,
        config={
            **ds_config, 'database_ids': [moldb.id]
        },
        status=DatasetStatus.QUEUED,
        es=es,
    )

    queue_pub.publish({
        'ds_id': ds.id,
        'ds_name': test_ds_name,
        'action': DaemonAction.ANNOTATE
    })

    run_daemons(db, es, local_sm_config)

    # assertions on the dataset and job tables
    row = db.select_one('SELECT status from job')
    assert row[0] == 'FINISHED'
    row = db.select_one('SELECT status from dataset')
    assert row[0] == 'FAILED'
Example 24
              "JOIN agg_formula f ON f.id = m.sf_id AND sf_db.id = f.db_id "
              "JOIN job j ON j.id = m.job_id "
              "JOIN dataset ds ON ds.id = j.ds_id "
              "JOIN theor_peaks tp ON tp.db_id = sf_db.id AND tp.sf_id = m.sf_id AND tp.adduct = m.adduct "
              "WHERE sf_db.name = %s AND ds.name = %s "
              "AND ROUND(sigma::numeric, 6) = %s AND charge = %s AND pts_per_mz = %s")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Exporting search results into a csv file')
    parser.add_argument('ds_name', type=str, help='Dataset name')
    parser.add_argument('csv_path', type=str, help='Path for the csv file')
    parser.add_argument('--config', dest='sm_config_path', type=str, help='SM config path')
    parser.set_defaults(sm_config_path=path.join(proj_root(), 'conf/config.json'))
    args = parser.parse_args()

    SMConfig.set_path(args.sm_config_path)
    db = DB(SMConfig.get_conf()['db'])

    ds_config = db.select_one(DS_CONFIG_SEL, args.ds_name)[0]
    isotope_gen_config = ds_config['isotope_generation']
    charge = '{}{}'.format(isotope_gen_config['charge']['polarity'], isotope_gen_config['charge']['n_charges'])
    export_rs = db.select(EXPORT_SEL, ds_config['database']['name'], args.ds_name,
                          isotope_gen_config['isocalc_sigma'], charge, isotope_gen_config['isocalc_pts_per_mz'])

    header = ','.join(['formula_db', 'ds_name', 'sf', 'adduct', 'chaos', 'img_corr', 'pat_match',
                       'isocalc_sigma', 'isocalc_charge', 'isocalc_pts_per_mz', 'first_peak_mz']) + '\n'
    with open(args.csv_path, 'w') as f:
        f.write(header)
        f.writelines([','.join(map(str, row)) + '\n' for row in export_rs])
    logger.info('Exported all search results for "%s" dataset into "%s" file', args.ds_name, args.csv_path)
Example 25
class TheorPeaksGenerator(object):
    """ Generator of theoretical isotope peaks for all molecules in a database.

    Args
    ----------
    sc : pyspark.SparkContext
    sm_config : dict
        SM engine config
    ds_config : dict
        Dataset config
    """
    def __init__(self, sc, sm_config, ds_config):
        self.sc = sc
        self.sm_config = sm_config
        self.ds_config = ds_config

        self.theor_peaks_tmp_dir = join(sm_config['fs']['base_path'],
                                        'tmp_theor_peaks_gen')
        self.db = DB(sm_config['db'])

        self.adducts = self.ds_config['isotope_generation']['adducts']

        self.isocalc_wrapper = IsocalcWrapper(
            self.ds_config['isotope_generation'])

    @staticmethod
    def _sf_elements(sf):
        return [
            seg.element().name() for seg in parseSumFormula(sf).get_segments()
        ]

    @classmethod
    def _valid_sf_adduct(cls, sf, adduct):
        if sf is None or adduct is None or sf == 'None' or adduct == 'None':
            logger.warning('Invalid sum formula or adduct: sf=%s, adduct=%s',
                           sf, adduct)
            return False

        if '-' in adduct and adduct.strip('-') not in cls._sf_elements(sf):
            logger.info(
                'No negative adduct element in the sum formula: sf=%s, adduct=%s',
                sf, adduct)
            return False

        return True

    def run(self):
        """ Starts peaks generation. Checks all formula peaks saved in the database and
        generates peaks only for new ones"""
        logger.info('Running theoretical peaks generation')

        db_id = self.db.select_one(DB_ID_SEL,
                                   self.ds_config['database']['name'])[0]
        formula_list = self.apply_database_filters(
            self.db.select(AGG_FORMULA_SEL, db_id))

        stored_sf_adduct = self.db.select(SF_ADDUCT_SEL, db_id,
                                          self.isocalc_wrapper.sigma,
                                          self.isocalc_wrapper.charge,
                                          self.isocalc_wrapper.pts_per_mz)

        sf_adduct_cand = self.find_sf_adduct_cand(formula_list,
                                                  set(stored_sf_adduct))
        logger.info('%d saved (sf, adduct)s, %s not saved (sf, adduct)s',
                    len(stored_sf_adduct), len(sf_adduct_cand))

        if sf_adduct_cand:
            self.generate_theor_peaks(sf_adduct_cand)

    def apply_database_filters(self, formula_list):
        """ Filters according to settings in dataset config

        Args
        ----
        formula_list : list
            List of pairs (id, sum formula) to search through

        Returns
        -------
        : list
            Filtered list of pairs (id, sum formula)
        """
        if 'organic' in map(lambda s: s.lower(),
                            self.ds_config['database'].get('filters', [])):
            logger.info('Organic sum formula filter has been applied')
            return filter(lambda (_, sf): 'C' in self._sf_elements(sf),
                          formula_list)
        return formula_list

    def find_sf_adduct_cand(self, formula_list, stored_sf_adduct):
        """
        Args
        ----
        formula_list : list
            List of pairs (id, sum formula) to search through
        stored_sf_adduct : set
            Set of (formula, adduct) pairs which have theoretical patterns saved in the database

        Returns
        -------
        : list
            List of (formula id, formula, adduct) triples which don't have theoretical patterns saved in the database
        """
        assert formula_list, 'Empty agg_formula table!'
        adducts = set(self.adducts) | set(DECOY_ADDUCTS)
        cand = [(id, sf, a) for (id, sf) in formula_list for a in adducts]
        return filter(
            lambda (sf_id, sf, adduct): (sf, adduct) not in stored_sf_adduct,
            cand)

    def generate_theor_peaks(self, sf_adduct_cand):
        """
        Args
        ----
        sf_adduct_cand : list
            List of (formula id, formula, adduct) triples which don't have theoretical patterns saved in the database

        Returns
        -------
        : list
            List of strings with formatted theoretical peaks data
        """
        logger.info('Generating missing peaks')
        formatted_iso_peaks = self.isocalc_wrapper.formatted_iso_peaks
        db_id = self.db.select_one(DB_ID_SEL,
                                   self.ds_config['database']['name'])[0]
        n = 10000
        for i in xrange(0, len(sf_adduct_cand), n):
            sf_adduct_cand_rdd = self.sc.parallelize(sf_adduct_cand[i:i + n],
                                                     numSlices=128)
            peak_lines = (sf_adduct_cand_rdd.flatMap(
                lambda (sf_id, sf, adduct): formatted_iso_peaks(
                    db_id, sf_id, sf, adduct)).collect())
            self._import_theor_peaks_to_db(peak_lines)

    def _import_theor_peaks_to_db(self, peak_lines):
        logger.info('Saving new peaks to the DB')
        if not exists(self.theor_peaks_tmp_dir):
            makedirs(self.theor_peaks_tmp_dir)

        peak_lines_path = join(self.theor_peaks_tmp_dir, 'peak_lines.csv')
        with open(peak_lines_path, 'w') as f:
            f.write('\n'.join(peak_lines))

        with open(peak_lines_path) as peaks_file:
            self.db.copy(peaks_file, 'theor_peaks')
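Going by the constructor and run() above (and by how Example 28 below wires the same class into SearchJob), driving the generator takes two lines; sc, sm_config and ds_config are assumed to be a live SparkContext and the already-loaded config dicts:

# Hypothetical driver for the class above; sc, sm_config and ds_config are assumed to exist
theor_peaks_gen = TheorPeaksGenerator(sc, sm_config, ds_config)
theor_peaks_gen.run()  # generates and stores only the (sf, adduct) peaks missing from theor_peaks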
Example 26
class TheorPeaksGenerator(object):
    """ Generator of theoretical isotope peaks for all molecules in a database.

    Args
    ----------
    sc : pyspark.SparkContext
    sm_config : dict
        SM engine config
    ds_config : dict
        Dataset config
    """
    def __init__(self, sc, sm_config, ds_config):  # TODO: replace sm_config with db
        self.sc = sc
        self.sm_config = sm_config
        self.ds_config = ds_config

        self.theor_peaks_tmp_dir = join(sm_config['fs']['base_path'], 'tmp_theor_peaks_gen')
        self.db = DB(sm_config['db'])

        self.adducts = self.ds_config['isotope_generation']['adducts']

        self.isocalc_wrapper = IsocalcWrapper(self.ds_config['isotope_generation'])

    @staticmethod
    def _sf_elements(sf):
        return [seg.element().name() for seg in parseSumFormula(sf).get_segments()]

    @classmethod
    def _valid_sf_adduct(cls, sf, adduct):
        if sf is None or adduct is None or sf == 'None' or adduct == 'None':
            logger.warning('Invalid sum formula or adduct: sf=%s, adduct=%s', sf, adduct)
            return False

        if '-' in adduct and adduct.strip('-') not in cls._sf_elements(sf):
            logger.info('No negative adduct element in the sum formula: sf=%s, adduct=%s', sf, adduct)
            return False

        return True

    def run(self):
        """ Starts peaks generation. Checks all formula peaks saved in the database and
        generates peaks only for new ones"""
        logger.info('Running theoretical peaks generation')

        db_id = self.db.select_one(DB_ID_SEL, self.ds_config['database']['name'])[0]
        formula_list = self.apply_database_filters(self.db.select(AGG_FORMULA_SEL, db_id))

        stored_sf_adduct = self.db.select(SF_ADDUCT_SEL, db_id,
                                          self.isocalc_wrapper.sigma,
                                          self.isocalc_wrapper.charge,
                                          self.isocalc_wrapper.pts_per_mz)

        sf_adduct_cand = self.find_sf_adduct_cand(formula_list, set(stored_sf_adduct))
        logger.info('%d saved (sf, adduct)s, %s not saved (sf, adduct)s', len(stored_sf_adduct), len(sf_adduct_cand))

        if sf_adduct_cand:
            self.generate_theor_peaks(sf_adduct_cand)

    def apply_database_filters(self, formula_list):
        """ Filters according to settings in dataset config

        Args
        ----
        formula_list : list
            List of pairs (id, sum formula) to search through

        Returns
        -------
        : list
            Filtered list of pairs (id, sum formula)
        """
        if 'organic' in map(lambda s: s.lower(), self.ds_config['database'].get('filters', [])):
            logger.info('Organic sum formula filter has been applied')
            return filter(lambda (_, sf): 'C' in self._sf_elements(sf), formula_list)
        return formula_list

    def find_sf_adduct_cand(self, formula_list, stored_sf_adduct):
        """
        Args
        ----
        formula_list : list
            List of pairs (id, sum formula) to search through
        stored_sf_adduct : set
            Set of (formula, adduct) pairs which have theoretical patterns saved in the database

        Returns
        -------
        : list
            List of (formula id, formula, adduct) triples which don't have theoretical patterns saved in the database
        """
        assert formula_list, 'Empty agg_formula table!'
        adducts = set(self.adducts) | set(DECOY_ADDUCTS)
        cand = [(id, sf, a) for (id, sf) in formula_list for a in adducts]
        return filter(lambda (sf_id, sf, adduct): (sf, adduct) not in stored_sf_adduct, cand)

    def generate_theor_peaks(self, sf_adduct_cand):
        """
        Args
        ----
        sf_adduct_cand : list
            List of (formula id, formula, adduct) triples which don't have theoretical patterns saved in the database

        Returns
        -------
        : list
            List of strings with formatted theoretical peaks data
        """
        logger.info('Generating missing peaks')
        formatted_iso_peaks = self.isocalc_wrapper.formatted_iso_peaks
        db_id = self.db.select_one(DB_ID_SEL, self.ds_config['database']['name'])[0]
        n = 10000
        for i in xrange(0, len(sf_adduct_cand), n):
            sf_adduct_cand_rdd = self.sc.parallelize(sf_adduct_cand[i:i+n], numSlices=128)
            peak_lines = (sf_adduct_cand_rdd
                          .flatMap(lambda (sf_id, sf, adduct): formatted_iso_peaks(db_id, sf_id, sf, adduct))
                          .collect())
            self._import_theor_peaks_to_db(peak_lines)

    def _import_theor_peaks_to_db(self, peak_lines):
        logger.info('Saving new peaks to the DB')
        if not exists(self.theor_peaks_tmp_dir):
            makedirs(self.theor_peaks_tmp_dir)

        peak_lines_path = join(self.theor_peaks_tmp_dir, 'peak_lines.csv')
        with open(peak_lines_path, 'w') as f:
            f.write('\n'.join(peak_lines))

        with open(peak_lines_path) as peaks_file:
            self.db.copy(peaks_file, 'theor_peaks')
Example 27
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Exporting search results into a csv file')
    parser.add_argument('ds_name', type=str, help='Dataset name')
    parser.add_argument('csv_path', type=str, help='Path for the csv file')
    parser.add_argument('--config',
                        dest='sm_config_path',
                        type=str,
                        help='SM config path')
    parser.set_defaults(
        sm_config_path=path.join(proj_root(), 'conf/config.json'))
    args = parser.parse_args()

    SMConfig.set_path(args.sm_config_path)
    db = DB(SMConfig.get_conf()['db'])

    ds_config = db.select_one(DS_CONFIG_SEL, args.ds_name)[0]
    isotope_gen_config = ds_config['isotope_generation']
    charge = '{}{}'.format(isotope_gen_config['charge']['polarity'],
                           isotope_gen_config['charge']['n_charges'])
    export_rs = db.select(EXPORT_SEL, ds_config['database']['name'],
                          args.ds_name, isotope_gen_config['isocalc_sigma'],
                          charge, isotope_gen_config['isocalc_pts_per_mz'])

    header = '\t'.join(['formula_db', 'db_ids', 'sf_name', 'sf', 'adduct']) +'\t' + '\t'.join(metrics) + '\t' + \
             '\t'.join(['fdr', 'isocalc_sigma', 'isocalc_charge', 'isocalc_pts_per_mz', 'first_peak_mz']) + '\n'
    with open(args.csv_path, 'w') as f:
        f.write(header)
        f.writelines(['\t'.join(map(str, row)) + '\n' for row in export_rs])
    logger.info('Exported all search results for "%s" dataset into "%s" file',
                args.ds_name, args.csv_path)
Example 28
class SearchJob(object):
    """ Main class responsible for molecule search. Uses other modules of the engine.

    Args
    ----------
    ds_name : string
        A dataset short name
    """
    def __init__(self, client_email, ds_name):
        self.sm_config = SMConfig.get_conf()
        self.client_email = client_email
        self.ds_name = ds_name
        self.ds_id = None
        self.job_id = None
        self.sc = None
        self.db = None
        self.ds = None
        self.fdr = None
        self.formulas = None
        self.ds_config = None
        self.wd_manager = None

    def _read_ds_config(self):
        with open(self.wd_manager.ds_config_path) as f:
            self.ds_config = json.load(f)

    def _configure_spark(self):
        logger.info('Configuring Spark')
        sconf = SparkConf()
        for prop, value in self.sm_config['spark'].iteritems():
            if prop.startswith('spark.'):
                sconf.set(prop, value)

        if 'aws' in self.sm_config:
            sconf.set("spark.hadoop.fs.s3a.access.key",
                      self.sm_config['aws']['aws_access_key_id'])
            sconf.set("spark.hadoop.fs.s3a.secret.key",
                      self.sm_config['aws']['aws_secret_access_key'])
            sconf.set("spark.hadoop.fs.s3a.impl",
                      "org.apache.hadoop.fs.s3a.S3AFileSystem")

        # sconf.set("spark.python.profile", "true")
        self.sc = SparkContext(master=self.sm_config['spark']['master'],
                               conf=sconf,
                               appName='SM engine')
        if not self.sm_config['spark']['master'].startswith('local'):
            self.sc.addPyFile(join(local_path(proj_root()), 'sm.zip'))

    def _init_db(self):
        logger.info('Connecting to the DB')
        self.db = DB(self.sm_config['db'])
        self.sf_db_id = self.db.select_one(
            DB_ID_SEL, self.ds_config['database']['name'])[0]

    def store_job_meta(self):
        """ Store search job metadata in the database """
        logger.info('Storing job metadata')
        self.ds_id = int(self.db.select_one(DS_ID_SEL, self.ds_name)[0])
        self.job_id = self.ds_id
        self.db.alter(DEL_JOB_SQL, self.job_id)
        rows = [(self.job_id, self.sf_db_id, self.ds_id,
                 datetime.now().strftime('%Y-%m-%d %H:%M:%S'))]
        self.db.insert(JOB_INS, rows)

        rows = [(self.job_id, adduct)
                for adduct in self.ds_config['isotope_generation']['adducts']]
        self.db.insert(ADDUCT_INS, rows)

    def run(self, input_path, ds_config_path, clean=False):
        """ Entry point of the engine. Molecule search is completed in several steps:
         * Copying the input data to the engine work dir
         * Converting the input data (imzML+ibd) to plain text format, one line per spectrum
         * Generating theoretical peaks for all formulas from the molecule database and saving them to the database
         * Searching for molecules - the most compute-intensive step, run in a distributed manner with Spark
         * Saving the results (isotope images and their quality metrics for each putative molecule) to the database

        Args
        -------
        input_path : string
            Path to the dataset folder with .imzML and .ibd files
        ds_config_path: string
            Path to the dataset config file
        clean : bool
            Clean all interim data files before starting molecule search
        """
        try:
            self.wd_manager = WorkDirManager(self.ds_name)
            if clean:
                self.wd_manager.clean()

            self.wd_manager.copy_input_data(input_path, ds_config_path)

            self._read_ds_config()
            logger.info('Dataset config:\n%s', pformat(self.ds_config))

            self._configure_spark()
            self._init_db()

            if not self.wd_manager.exists(self.wd_manager.txt_path):
                imzml_converter = ImzmlTxtConverter(
                    self.ds_name, self.wd_manager.local_dir.imzml_path,
                    self.wd_manager.local_dir.txt_path,
                    self.wd_manager.local_dir.coord_path)
                imzml_converter.convert()

                if not self.wd_manager.local_fs_only:
                    self.wd_manager.upload_to_remote()

            self.ds = Dataset(self.sc, self.ds_name, self.client_email,
                              input_path, self.ds_config, self.wd_manager,
                              self.db)
            self.ds.save_ds_meta()

            self.store_job_meta()

            theor_peaks_gen = TheorPeaksGenerator(self.sc, self.sm_config,
                                                  self.ds_config)
            theor_peaks_gen.run()

            target_adducts = self.ds_config['isotope_generation']['adducts']
            self.fdr = FDR(self.job_id,
                           self.sf_db_id,
                           decoy_sample_size=20,
                           target_adducts=target_adducts,
                           db=self.db)
            self.fdr.decoy_adduct_selection()
            self.formulas = FormulasSegm(self.job_id, self.sf_db_id,
                                         self.ds_config, self.db)

            # search_alg = MSMBasicSearch(self.sc, self.ds, self.formulas, self.fdr, self.ds_config)
            search_alg = MSMExtraFeats(self.sc, self.ds, self.formulas,
                                       self.fdr, self.ds_config)
            sf_metrics_df, sf_iso_images = search_alg.search()

            search_results = SearchResults(
                self.sf_db_id, self.ds_id, self.job_id, self.ds_name,
                self.formulas.get_sf_adduct_peaksn(), self.db, self.sm_config,
                self.ds_config)
            search_results.sf_metrics_df = sf_metrics_df
            search_results.sf_iso_images = sf_iso_images
            search_results.metrics = search_alg.metrics
            search_results.nrows, search_results.ncols = self.ds.get_dims()
            search_results.store()

            es = ESExporter(self.sm_config)
            es.index_ds(self.db, self.ds_name,
                        self.ds_config['database']['name'])

        except Exception:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            logger.error('\n'.join(
                traceback.format_exception(exc_type, exc_value,
                                           exc_traceback)))
        finally:
            if self.sc:
                # self.sc.show_profiles()
                self.sc.stop()
            if self.db:
                self.db.close()
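Going by the constructor and the run() signature above, a minimal driver would look like this; the config path, email, dataset name and input paths are placeholders:

# Hypothetical entry point; all paths, the email and the dataset name are placeholders
SMConfig.set_path('conf/config.json')  # SearchJob reads SMConfig.get_conf() in its constructor
job = SearchJob(client_email='user@example.com', ds_name='my_dataset')
job.run(input_path='/data/my_dataset', ds_config_path='/data/my_dataset/config.json', clean=True)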