Example #1
import pathlib

from metaspace.sm_annotation_utils import SMInstance

# logon_metaspace, extract_results_metaspace, dl_img and copy_by_parent are
# project-specific helpers defined elsewhere in the original codebase.


def reporting_loop(ori_ds_id,
                   db_id,
                   msms_ds_id,
                   out_path,
                   parent_and_fragment_req=True,
                   fdr_max=0.5,
                   save_image=True):
    # Access server and logon!
    sm = SMInstance(host='https://beta.metaspace2020.eu')
    sm = logon_metaspace(sm)

    # Fetch results for the target database and parse them
    ds = sm.dataset(id=msms_ds_id)
    results_df = ds.results(database=db_id).reset_index()
    results_df = extract_results_metaspace(msms_ds_id, results_df)

    if parent_and_fragment_req:
        results_df = results_df[(results_df.parent_y == 1)
                                & (results_df.n_frag_y > 0)]

    pathlib.Path(out_path + msms_ds_id + '/').mkdir(parents=True,
                                                    exist_ok=True)
    out_df = (out_path + msms_ds_id + '/' + 'ms2_' + msms_ds_id + '_db_' +
              db_id + '_ms1_' + ori_ds_id + '.pickle')
    results_df.to_pickle(out_df)

    # Loop to download images from METASPACE for datasets
    img_dict = {}
    img_dict[out_df] = dl_img(ds, msms_ds_id, db_id, fdr_max,
                              out_path + msms_ds_id + '/by_formula/',
                              save_image)

    # Group ion images or arrays, keyed by formula, by parent ID
    copy_by_parent(img_dict, msms_ds_id, out_df, out_path, save_image)

    return
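
A minimal usage sketch (all IDs and the output path below are placeholders, not real METASPACE identifiers):

reporting_loop(ori_ds_id='<ms1_dataset_id>',
               db_id='<database_id>',
               msms_ds_id='<msms_dataset_id>',
               out_path='/tmp/msms_reports/',
               fdr_max=0.5,
               save_image=True)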
Example #2
def get_reference_results(metaspace_options, ds_id):
    from metaspace.sm_annotation_utils import SMInstance
    if metaspace_options.get('host'):
        sm = SMInstance(host=metaspace_options['host'])
    else:
        sm = SMInstance()
    if metaspace_options.get('password'):
        sm.login(metaspace_options['email'], metaspace_options['password'])

    ds = sm.dataset(id=ds_id)
    reference_results = (ds.results('HMDB-v4')
        .reset_index()
        .rename({'moc': 'chaos', 'rhoSpatial': 'spatial', 'rhoSpectral': 'spectral'}, axis=1))
    return reference_results[['formula', 'adduct', 'chaos', 'spatial', 'spectral', 'msm', 'fdr']]
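
A usage sketch (the dataset ID is a placeholder; 'password' and 'email' can be omitted for public datasets):

opts = {'host': 'https://metaspace2020.eu'}
ref_df = get_reference_results(opts, '<dataset_id>')
print(ref_df.head())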
Example #3
import numpy as np

from metaspace.sm_annotation_utils import SMInstance

# get_coloc_matrix is defined elsewhere in the original file.


def get_ds_data(ds_id, fdr):
    sm = SMInstance()
    anns = sm._gqclient.getAnnotations(
        {
            'database': 'HMDB-v4',
            'fdrLevel': fdr,
            'hasNeutralLoss': False,
            'hasChemMod': False,
            'hasHiddenAdduct': False
        }, {'ids': ds_id})
    if len(anns) > 2:
        coloc = get_coloc_matrix(anns)
        mzs = np.array([ann['mz'] for ann in anns])
        mz_range = np.min(mzs), np.max(mzs)
        mz_dict = {ann['sumFormula'] + ann['adduct']: ann['mz']
                   for ann in anns}
        return coloc, mz_range, mz_dict
    return None
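
A usage sketch (placeholder dataset ID; the function returns None for datasets with two or fewer annotations):

result = get_ds_data('<dataset_id>', fdr=0.1)
if result is not None:
    coloc, mz_range, mz_dict = result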
Example #4
            'hasChemMod': False,
            'hasHiddenAdduct': False
        }, {'ids': ds_id})
    if len(anns) > 2:
        coloc = get_coloc_matrix(anns)
        mzs = np.array([ann['mz'] for ann in anns])
        mz_range = np.min(mzs), np.max(mzs)
        mz_dict = {ann['sumFormula'] + ann['adduct']: ann['mz']
                   for ann in anns}
        return coloc, mz_range, mz_dict
    return None


# test_coloc, test_mz_range, test_mz_dict = get_ds_data('2019-08-24_17h34m28s')
#%% Process, Concat, Save colocalizations
sm = SMInstance()
# sm.login(**json.loads(open('/home/lachlan/.metaspace.json')))


@filecache(mru=0)
def fetch_data_from_metaspace(pol, fdr):
    all_coloc = []
    all_ranges = []
    all_mz_dict = {}
    ion_present_in_ds = set()
    datasets = [
        SMDataset(info, sm._gqclient) for info in sm._gqclient.getDatasets(
            {'polarity': 'POSITIVE' if pol == POS else 'NEGATIVE'})
    ]
    ds_ids = []
    with ProcessPoolExecutor(8) as ex:
Example #5
It looks like the Python client can't handle cases where a candidate molecule has a None URL.
As an interim workaround, I'd suggest making a local copy of the results() method that you're using, from here:
https://github.com/metaspace2020/metaspace/blob/master/metaspace/python-client/metaspace/sm_annotation_utils.py#L666-L711

Four changes are needed:
* Remove the self argument on line 666 so that you can call it from outside the class
* Change records = self._gqclient.getAnnotations( to sm._gqclient.getAnnotations( on line 681 to remove the dependency on the SMDataset class
* Change self.id to the dataset ID on line 683
* Remove the moleculeIds= assignment on line 695 - that's where the bug is.

'''

import pandas as pd
from metaspace.sm_annotation_utils import SMInstance
#sm = SMInstance()
sm = SMInstance(host='https://beta.metaspace2020.eu')

def results(dsid_used, database, fdr=None, coloc_with=None):
    if coloc_with:
        assert fdr
        coloc_coeff_filter = {
            'database': database,
            'colocalizedWith': coloc_with,
            'fdrLevel': fdr,
        }
        annotation_filter = coloc_coeff_filter.copy()
    else:
        coloc_coeff_filter = None
        annotation_filter = {'database': database, 'hasHiddenAdduct': True}
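
A usage sketch, assuming the rest of the method body has been copied from the linked file and patched as described above (the dataset ID is a placeholder):

ref_df = results('<dataset_id>', 'HMDB-v4', fdr=0.1)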
Example #6
to fix the code here to keep track of them.
"""

import logging
from copy import deepcopy
from pathlib import Path
from typing import Any, Tuple

import pandas as pd
from metaspace.sm_annotation_utils import SMInstance

from sm.engine.ds_config import DSConfig
from sm.fdr_engineering.rerun_datasets import reprocess_dataset_remote, wait_for_datasets

# GlobalInit() is only needed for the "_local" functions
logger = logging.getLogger(__name__)
sm_src = SMInstance()
sm_dst = SMInstance(config_path=str(Path.home() / '.metaspace.local'))
DST_SUFFIX = '_ml_training'

core_metabolome_dst_id = next(
    db.id for db in sm_dst.databases()
    if db.name == 'CoreMetabolome' and db.version == 'v3')
data_dir = Path('local/ml_scoring').resolve()  # the "local" subdirectory is .gitignored
data_dir.mkdir(parents=True, exist_ok=True)
dataset_ids_file = data_dir / 'dataset_ids.txt'
dataset_ids = [ds_id.strip() for ds_id in dataset_ids_file.open().readlines()]
dst_dataset_ids = [ds_id + DST_SUFFIX for ds_id in dataset_ids]

#%%
# # Test VPC
all_features = [
    # (earlier entries truncated in the original snippet)
    'mz_err_rel_abserr',
    # _fdr suffix applies the FDR transformation
    'chaos_fdr',
    'spatial_fdr',
    'spectral_fdr',
    'mz_err_abs_fdr',
    'mz_err_rel_fdr',
]
#%% Download the data or load it from a local cache file
downloaded_data_file = data_dir / 'metrics_df_fdr20.parquet'
FORCE_REDOWNLOAD = False
if downloaded_data_file.exists() and not FORCE_REDOWNLOAD:
    metrics_df = pd.read_parquet(downloaded_data_file)
    logger.info(f'Loaded {downloaded_data_file}')
else:
    sm_dst = SMInstance(config_path=str(Path.home() / '.metaspace.local'))

    # ds_diags is kept as a lazy iterable to limit peak memory use
    ds_diags = get_many_fdr_diagnostics_remote(sm_dst, dst_dataset_ids)
    metrics_df = get_ranking_data(ds_diags, all_features)
    metrics_df.to_parquet(downloaded_data_file)


#%% Recalculate FDR fields
def calc_fdr_fields(df):
    target = df.target == 1.0
    target_df = df[target].copy()
    decoy_df = df[~target].copy()
    # FIXME: Remove hard-coded value 20 - should be decoy_sample_size
    decoy_sample_size = 20 / df[df.target == 1].modifier.nunique()
    add_derived_features(target_df, decoy_df, decoy_sample_size, all_features)
Example #8
            'hasChemMod': False,
            'hasHiddenAdduct': False
        }, {'ids': ds_id})
    if len(anns) > 2:
        coloc = get_coloc_matrix(anns)
        mzs = np.array([ann['mz'] for ann in anns])
        mz_range = np.min(mzs), np.max(mzs)
        mz_dict = {ann['sumFormula'] + ann['adduct']: ann['mz']
                   for ann in anns}
        return coloc, mz_range, mz_dict
    return None


# test_coloc, test_mz_range, test_mz_dict = get_ds_data('2019-08-24_17h34m28s')
#%% Process, Concat, Save colocalizations
sm = SMInstance()
# sm.login(**json.loads(open('/home/lachlan/.metaspace.json')))

datasets = sm.datasets()


def fetch_data_from_metaspace(is_pos, coloc_filename, data_filename):
    all_coloc = []
    all_ranges = []
    all_mz_dict = {}
    ion_present_in_ds = set()
    datasets = [
        SMDataset(info, sm._gqclient) for info in sm._gqclient.getDatasets(
            {'polarity': 'POSITIVE' if is_pos else 'NEGATIVE'})
    ]
    ds_ids = []
Example #9
import re

import pandas as pd

# MATRIX_MAPPING is defined elsewhere in the original file; re.escape() makes
# the parentheses in its keys match literally in the combined pattern.
MATRIX_RE = re.compile('|'.join(re.escape(k) for k in MATRIX_MAPPING.keys()))


def normalize_matrix(matrix):
    if not matrix:
        return 'Other'
    m = MATRIX_RE.search(matrix.lower())
    if m:
        return MATRIX_MAPPING[m[0]]
    return 'Other'
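
A usage sketch (assuming MATRIX_MAPPING, defined elsewhere in the original file, contains a lowercase key such as 'dhb'):

normalize_matrix('DHB 20mg/mL')  # -> MATRIX_MAPPING['dhb']
normalize_matrix(None)           # -> 'Other'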


#%%

# Call sm.save_login() to save a credentials file to access private DSs.
sm = SMInstance()
all_datasets = sm.datasets(status='FINISHED')

all_ds_df = pd.DataFrame({
    'ds_id': ds.id,
    'name': ds.name,
    'group': ds.group['shortName'] if ds.group else 'None',
    'submitter': ds.submitter['name'],
    # pylint: disable=protected-access
    'is_public': ds._info['isPublic'],
    'polarity':
Example #10
import json
from pathlib import Path

from metaspace.sm_annotation_utils import SMInstance

# DiagnosticType is imported from the METASPACE codebase in the original file.


def sm():
    return SMInstance(config_path=(Path(__file__).parent /
                                   '../../test_config').resolve())


def reprocess_dataset_remote(
    sm_src: SMInstance,
    sm_dst: SMInstance,
    src_ds_id: str,
    dst_ds_id: str,
    update_metadata_func,
    skip_existing=True,
):
    try:
        dst_ds = sm_dst.dataset(id=dst_ds_id)
        assert dst_ds.status == 'FINISHED'
        assert any(diag['type'] == DiagnosticType.FDR_RESULTS
                   for diag in dst_ds.diagnostics(False))
        existing = True
    except Exception:
        existing = False

    if skip_existing and existing:
        print(f'Skipping {dst_ds_id}\n', end='')
        return dst_ds_id, None

    smds = sm_src.dataset(id=src_ds_id)
    ds_metadata, ds_config = update_metadata_func(smds.metadata, smds.config)

    # pylint: disable=protected-access  # There's no other clean way to get _gqclient
    gqclient_dst = sm_dst._gqclient
    graphql_response = gqclient_dst.create_dataset(
        {
            'name': smds.name,
            'inputPath': smds.s3dir,
            'metadataJson': json.dumps(ds_metadata),
            'databaseIds': ds_config['database_ids'],
            'adducts': ds_config['isotope_generation']['adducts'],
            'neutralLosses': ds_config['isotope_generation']['neutral_losses'],
            'chemMods': ds_config['isotope_generation']['chem_mods'],
            'ppm': ds_config['image_generation']['ppm'],
            'numPeaks': ds_config['isotope_generation']['n_peaks'],
            'decoySampleSize': ds_config['fdr']['decoy_sample_size'],
            'analysisVersion': ds_config['analysis_version'],
            'submitterId': sm_dst.current_user_id(),
            'groupId': gqclient_dst.get_primary_group_id(),
            # 'projectIds': project_ids,
            'isPublic': False,
            'scoringModel': ds_config['fdr'].get('scoring_model'),
            'computeUnusedMetrics': ds_config['image_generation']['compute_unused_metrics'],
        },
        ds_id=dst_ds_id,  # Requires admin account
    )
    return json.loads(graphql_response)['datasetId']
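
A usage sketch (dataset IDs are placeholders; the passthrough update_metadata_func below is a trivial stand-in for a real metadata-rewriting function):

new_ds_id = reprocess_dataset_remote(
    sm_src, sm_dst,
    src_ds_id='<src_ds_id>',
    dst_ds_id='<src_ds_id>' + DST_SUFFIX,
    update_metadata_func=lambda metadata, config: (metadata, config),
)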
Example #12
    def post(self, study_id):
        log_request(request)
        # param validation
        if study_id is None:
            abort(404, 'Please provide valid parameter for study identifier')
        study_id = study_id.upper()

        # User authentication
        user_token = None
        if "user_token" in request.headers:
            user_token = request.headers["user_token"]

        # check for access rights
        is_curator, read_access, write_access, obfuscation_code, study_location, release_date, submission_date, \
            study_status = wsc.get_permissions(study_id, user_token)
        if not write_access:
            abort(403)

        investigation = None
        metaspace_projects = None
        metaspace_api_key = None
        metaspace_password = None
        metaspace_email = None
        metaspace_datasets = None

        # body content validation
        if request.data:
            try:
                data_dict = json.loads(request.data.decode('utf-8'))
                project = data_dict['project']
                if project:
                    if "metaspace-api-key" in project:
                        metaspace_api_key = project['metaspace-api-key']
                    if "metaspace-password" in project:
                        metaspace_password = project['metaspace-password']
                    if "metaspace-email" in project:
                        metaspace_email = project['metaspace-email']
                    if "metaspace-datasets" in project:
                        metaspace_datasets = project['metaspace-datasets']
                        logger.info('Requesting METASPACE datasets ' + metaspace_datasets)
                    if "metaspace-projects" in project:
                        metaspace_projects = project['metaspace-projects']
                        logger.info('Requesting METASPACE projects ' + metaspace_projects)

                    # study_location = os.path.join(study_location, 'METASPACE')

                    sm = SMInstance()
                    if metaspace_api_key:
                        """
                        Log in with API key
                        Users can generate an API key in the "API access" section of https://metaspace2020.eu/user/me
                        If you're connecting to our GraphQL API directly, API key authentication requires an HTTP 
                        header "Authorization: Api-Key " followed by the key. """
                        sm.login(email=None, password=None, api_key=metaspace_api_key)
                        # logged_id = sm.logged_in
                    elif metaspace_password and metaspace_email:
                        sm.login(email=metaspace_email, password=metaspace_password, api_key=None)
                    else:
                        abort(406, "No METASPACE API key or username/password provided.")

                    if not os.path.isdir(study_location):
                        os.makedirs(study_location, exist_ok=True)

                    # Annotate the METASPACE project and return all relevant dataset and project ids
                    metaspace_project_ids, metaspace_dataset_ids = \
                        annotate_metaspace(study_id=study_id,
                                           sm=sm,
                                           metaspace_projects=metaspace_projects,
                                           metaspace_datasets=metaspace_datasets)

                    investigation = import_metaspace(study_id=study_id,
                                                     dataset_ids=metaspace_dataset_ids,
                                                     study_location=study_location,
                                                     user_token=user_token,
                                                     obfuscation_code=obfuscation_code,
                                                     sm_instance=sm)
            except KeyError:
                abort(406, "No 'project' parameter was provided.")
            except AttributeError as e:
                abort(417, "Missing attribute/element in JSON string" + str(e))
            except Exception as e:
                abort(417, str(e))

        if investigation:
            return {"Success": "METASPACE data imported successfully"}
        else:
            return {"Warning": "Please check if METASPACE data was successfully imported"}