Example #1
def _upload_imzmls_from_prefix_if_needed(src_path,
                                         storage,
                                         sm_storage,
                                         s3_client=None):
    if src_path.startswith('cos://'):
        bucket, prefix = src_path[len('cos://'):].split('/', maxsplit=1)
        keys = [
            f'cos://{bucket}/{key}'
            for key in storage.list_keys(bucket, prefix)
        ]
    elif src_path.startswith('s3a://'):
        bucket, prefix = split_s3_path(src_path)
        response = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix)
        if 'Contents' in response:
            keys = [
                f"s3a://{bucket}/{item['Key']}"
                for item in response['Contents']
            ]
        else:
            keys = []
    else:
        keys = [str(p) for p in Path(src_path).iterdir()]

    imzml_keys = [key for key in keys if key.lower().endswith('.imzml')]
    ibd_keys = [key for key in keys if key.lower().endswith('.ibd')]
    assert len(imzml_keys) == 1, imzml_keys
    assert len(ibd_keys) == 1, ibd_keys
    imzml_cobj = _upload_if_needed(imzml_keys[0], storage, sm_storage, 'imzml',
                                   s3_client)
    ibd_cobj = _upload_if_needed(ibd_keys[0], storage, sm_storage, 'imzml',
                                 s3_client)

    return imzml_cobj, ibd_cobj
Example #2
def parse_input_path_for_lithops(sm_config, input_path):
    if input_path.startswith('s3://') or input_path.startswith('s3a://'):
        backend = 'aws_s3'
        bucket, prefix = split_s3_path(input_path)
    else:
        backend = 'ibm_cos'
        bucket, prefix = split_cos_path(input_path)

    storage = Storage(sm_config['lithops'], backend)
    if backend == 'aws_s3' and sm_config['lithops']['aws_s3']['endpoint'].startswith('http://'):
        # WORKAROUND for local Minio access
        # Lithops forces the url to HTTPS, so overwrite the S3 client with a fixed client
        # https://github.com/lithops-cloud/lithops/issues/708
        storage.storage_handler.s3_client = get_s3_client()

    keys_in_path = storage.list_keys(bucket, prefix)
    imzml_keys = [
        key for key in keys_in_path if key.lower().endswith('.imzml')
    ]
    ibd_keys = [key for key in keys_in_path if key.lower().endswith('.ibd')]

    debug_info = f'Path {input_path} had keys: {keys_in_path}'
    assert len(imzml_keys) == 1, f"Couldn't determine imzML file. {debug_info}"
    assert len(ibd_keys) == 1, f"Couldn't determine ibd file. {debug_info}"

    imzml_cobject = CloudObject(storage.backend, bucket, imzml_keys[0])
    ibd_cobject = CloudObject(storage.backend, bucket, ibd_keys[0])
    return storage, imzml_cobject, ibd_cobject
Example #3
def read_moldb_file(file_path):
    try:
        if re.findall(r'^s3a?://', file_path):
            bucket_name, key = split_s3_path(file_path)
            sm_config = SMConfig.get_conf()
            buffer = get_s3_bucket(bucket_name, sm_config).Object(key).get()['Body']
        else:
            buffer = Path(file_path).open()
        moldb_df = pd.read_csv(buffer, sep='\t', dtype=object, na_filter=False)
    except ValueError as e:
        raise MalformedCSV(f'Malformed CSV: {e}') from e

    if moldb_df.empty:
        raise MalformedCSV('No data rows found')

    required_columns = {'id', 'name', 'formula'}
    if not required_columns.issubset(set(moldb_df.columns)):
        raise MalformedCSV(
            f'Missing columns. Provided: {moldb_df.columns.to_list()} Required: {required_columns}'
        )

    parsing_errors = _validate_moldb_df(moldb_df)
    if parsing_errors:
        raise BadData('Failed to parse some rows', *parsing_errors)

    moldb_df.rename({'id': 'mol_id', 'name': 'mol_name'}, axis='columns', inplace=True)
    return moldb_df
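For reference, a minimal usage sketch for read_moldb_file follows; the file path and rows are hypothetical, and it assumes _validate_moldb_df accepts the example formulas.

# Hypothetical usage of read_moldb_file; the path and rows are made up for illustration.
# The parser expects a tab-separated file with at least 'id', 'name' and 'formula' columns,
# which are renamed to 'mol_id' and 'mol_name' in the returned DataFrame.
example_tsv = 'id\tname\tformula\n1\tGlucose\tC6H12O6\n2\tCholesterol\tC27H46O\n'
Path('/tmp/example_moldb.tsv').write_text(example_tsv)
moldb_df = read_moldb_file('/tmp/example_moldb.tsv')
print(moldb_df.columns.to_list())  # ['mol_id', 'mol_name', 'formula']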
Example #4
def load_scoring_model(name: Optional[str]) -> ScoringModel:
    # Import DB locally so that Lithops doesn't try to pickle it & fail due to psycopg2
    # pylint: disable=import-outside-toplevel  # circular import
    from sm.engine.db import DB

    if name is None:
        return MsmScoringModel()

    row = DB().select_one(
        "SELECT type, params FROM scoring_model WHERE name = %s", (name, ))
    assert row, f'Scoring model {name} not found'
    type_, params = row

    if type_ == 'catboost':
        bucket, key = split_s3_path(params['s3_path'])
        with TemporaryDirectory() as tmpdir:
            model_file = Path(tmpdir) / 'model.cbm'
            with model_file.open('wb') as f:
                f.write(get_s3_client().get_object(Bucket=bucket,
                                                   Key=key)['Body'].read())
            model = CatBoost()
            model.load_model(str(model_file), 'cbm')

        return CatBoostScoringModel(name, model, params)
    else:
        raise ValueError(f'Unsupported scoring model type: {type_}')
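A brief usage sketch (the model name below is invented): passing None bypasses the database entirely, while any other name is looked up in the scoring_model table.

# Hypothetical usage of load_scoring_model; 'v3_default' is an invented model name.
msm_model = load_scoring_model(None)               # no DB access, returns MsmScoringModel()
catboost_model = load_scoring_model('v3_default')  # requires a matching scoring_model row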
Example #5
 def exists(self, path):
     try:
         self.s3.Object(*split_s3_path(path)).load()
     except ClientError as e:
         if e.response['Error']['Code'] == "404":
             return False
         else:
             raise e
     else:
         logger.info('Path s3://%s/%s already exists', self.bucket, path)
         return True
Example #6
 def _saved(self):
     """Check if ion centroids saved to parquet"""
     if self._centroids_stored_on_s3:
         bucket, key = split_s3_path(self._ion_centroids_path)
         try:
             for fn in self._parquet_file_names:
                 self._s3.head_object(Bucket=bucket, Key=f'{key}/{fn}')
         except ClientError:
             return False
         else:
             return True
     else:
         return all(
             (Path(self._ion_centroids_path) / fn).exists() for fn in self._parquet_file_names
         )
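The two methods above share the same probe-and-catch idiom: issue a metadata (HEAD) request and convert a ClientError into a "missing" result. A standalone helper in that spirit (the name s3_key_exists is ours, not from the source) might look like:

from botocore.exceptions import ClientError


def s3_key_exists(s3_client, bucket, key):
    """Return True if the key exists, False on a 404, and re-raise other errors.

    Illustrative helper only; it mirrors the pattern used by exists()/_saved() above.
    """
    try:
        s3_client.head_object(Bucket=bucket, Key=key)
        return True
    except ClientError as e:
        if e.response['Error']['Code'] == '404':
            return False
        raise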
Example #7
    def _copy_input_data(self, ds):
        logger.info('Copying input data')
        self._ds_data_path = Path(self._sm_config['fs']['spark_data_path']) / ds.id
        if ds.input_path.startswith('s3a://'):
            self._ds_data_path.mkdir(parents=True, exist_ok=True)

            bucket_name, key = split_s3_path(ds.input_path)
            bucket = storage.get_s3_bucket(bucket_name, self._sm_config)
            for obj_sum in bucket.objects.filter(Prefix=key):
                local_file = str(self._ds_data_path / Path(obj_sum.key).name)
                logger.debug(f'Downloading s3a://{bucket_name}/{obj_sum.key} -> {local_file}')
                obj_sum.Object().download_file(local_file)
        else:
            rmtree(self._ds_data_path, ignore_errors=True)
            copytree(src=ds.input_path, dst=self._ds_data_path)
Example #8
def _upload_if_needed(src_path,
                      storage,
                      sm_storage,
                      storage_type,
                      s3_client=None,
                      use_db_mutex=True):
    """
    Uploads the object from `src_path` if it doesn't already exist in its translated COS path.
    Returns a CloudObject for the COS object
    """
    bucket, key = _choose_cos_location(src_path, sm_storage, storage_type)

    with ExitStack() as stack:
        if use_db_mutex:
            # Lock during upload to prevent parallel jobs from uploading the same file simultaneously
            stack.enter_context(DBMutex().lock(bucket + key, timeout=1200))

        try:
            storage.head_object(bucket, key)
            logger.debug(f'{src_path} already uploaded')
            return CloudObject(storage.backend, bucket, key)
        except StorageNoSuchKeyError:
            logger.info(f'Uploading {src_path}...')
            if src_path.startswith('s3a://'):
                assert s3_client, 'S3 client must be supplied to support s3a:// paths'
                src_bucket, src_key = split_s3_path(src_path)

                obj = s3_client.get_object(Bucket=src_bucket, Key=src_key)
                if hasattr(storage.get_client(), 'upload_fileobj'):
                    # Try streaming upload to IBM COS
                    transfer_config = TransferConfig(
                        multipart_chunksize=20 * MB, max_concurrency=20, io_chunksize=1 * MB
                    )
                    storage.get_client().upload_fileobj(
                        Fileobj=obj['Body'], Bucket=bucket, Key=key, Config=transfer_config
                    )
                    cobject = CloudObject(storage.backend, bucket, key)
                else:
                    # Fall back to buffering the entire object in memory for other backends
                    cobject = storage.put_cloudobject(obj['Body'].read(), bucket, key)
            else:
                cobject = storage.put_cloudobject(open(src_path, 'rb'), bucket, key)
            logger.info(f'Uploading {src_path}...Done')
            return cobject
Example #9
 def exists(self):
     """ Check if ion centroids saved to parquet
     """
     if self._ion_centroids_path.startswith('s3a://'):
         cred_dict = dict(aws_access_key_id=self._sm_config['aws']['aws_access_key_id'],
                          aws_secret_access_key=self._sm_config['aws']['aws_secret_access_key'])
         bucket, key = split_s3_path(self._ion_centroids_path)
         s3 = boto3.client('s3', **cred_dict)
         try:
             s3.head_object(Bucket=bucket, Key=key + '/ions/_SUCCESS')
         except ClientError:
             return False
         else:
             return True
     else:
         return Path(self._ion_centroids_path + '/ions/_SUCCESS').exists()
Example #10
def _choose_cos_location(src_path, sm_storage, storage_type):
    """Maps the provided COS/S3/local filesystem path to an appropriate bucket & key in COS"""
    bucket, prefix = sm_storage[storage_type]
    src_path = str(src_path)
    if src_path.startswith('cos://'):
        # Already in COS - no need to translate path
        return split_cos_path(src_path)

    if src_path.startswith('s3a://'):
        # Ignore the bucket and take the key
        _, suffix = split_s3_path(src_path)
    else:
        # Ignore the directory and take the filename
        suffix = Path(src_path).name

    key = f'{prefix}/{suffix}' if prefix else suffix
    return bucket, key
Example #11
 def exists(self):
     """ Check if ion centroids saved to parquet
     """
     if self._ion_centroids_path.startswith('s3a://'):
         cred_dict = dict(
             aws_access_key_id=self._sm_config['aws']['aws_access_key_id'],
             aws_secret_access_key=self._sm_config['aws']['aws_secret_access_key'],
         )
         bucket, key = split_s3_path(self._ion_centroids_path)
         s3 = boto3.client('s3', **cred_dict)
         try:
             s3.head_object(Bucket=bucket, Key=key + '/ions/_SUCCESS')
         except ClientError:
             return False
         else:
             return True
     else:
         return Path(self._ion_centroids_path + '/ions/_SUCCESS').exists()
Example #12
    def copy_input_data(self, input_data_path):
        """ Copy mass spec files from input path to a dataset work directory

        Args
        ----
        input_data_path : str
            Path to input files
        """
        logger.info('Copying data from %s to %s', input_data_path, self.local_dir.ds_path)

        if input_data_path.startswith('s3a://'):
            cmd_check('mkdir -p {}', self.local_dir.ds_path)
            bucket_name, inp_path = split_s3_path(input_data_path)

            bucket = self.s3.Bucket(bucket_name)
            for obj in bucket.objects.filter(Prefix=inp_path):
                if not obj.key.endswith('/'):
                    path = join(self.local_dir.ds_path, obj.key.split('/')[-1])
                    self.s3transfer.download_file(bucket_name, obj.key, path)
        else:
            self.local_dir.copy(input_data_path, self.local_dir.ds_path)
Example #13
# MZ FILE SEARCH ON S3
#
import io
import boto3
from sm.engine import util as sm_engine_utils
from sm.browser import mz_search

s3 = boto3.resource("s3")
list(s3.buckets.all())

s3_path = (
    "s3://sm-engine-dev/dataset-browser/"
    "20200228_366x629_30um_Mouse_Obesity_16w_DAN_Neg_mode_190-2000mz_70K_Laser37_6/"
    "segment_0000.bin"
)
bucket_name, key = sm_engine_utils.split_s3_path(s3_path)
mz_bin_object = s3.Object(bucket_name=bucket_name, key=key)
type(mz_bin_object), mz_bin_object.content_length
# %time _ = mz_bin_object.content_length

s3_file = mz_search.S3File(mz_bin_object)
# s3_file.seek(0)
# bytes = s3_file.read(12)
# # %time bytes = s3_file.read(12)
# mz_chunks_array = np.frombuffer(bytes, dtype="f").reshape(-1, 3)

mz = 231.0927
ppm = 3
mz_lo, mz_hi = mz - mz * ppm * 1e-6, mz + mz * ppm * 1e-6

import random
Example #14
 def _upload_to_s3(self):
     bucket, key = split_s3_path(self._ion_centroids_path)
     for fn in self._parquet_file_names:
         self._s3.upload_file(
             Filename=str(self._local_ion_centroids_path / fn), Bucket=bucket, Key=f'{key}/{fn}'
         )
Example #15
 def copy(self, local, remote):
     logger.info('Copying from {} to {}'.format(local, remote))
     self.s3transfer.upload_file(local, *split_s3_path(remote))
Example #16
 def _download_from_s3(self):
     bucket, key = split_s3_path(self._ion_centroids_path)
     for fn in self._parquet_file_names:
         self._s3.download_file(
             Bucket=bucket, Key=f'{key}/{fn}', Filename=str(self._local_ion_centroids_path / fn)
         )
Example #17
 def __init__(self, base_path, ds_id, s3, s3transfer):
     self.s3 = s3
     self.s3transfer = s3transfer
     self.bucket, path = split_s3_path(base_path)
     self.ds_path = join(path, ds_id)
Example #18
 def del_input_data(self, input_data_path):
     if input_data_path.startswith('s3a://'):
         bucket, path = split_s3_path(input_data_path)
         delete_s3_path(bucket, path, self.s3)
     else:
         delete_local_path(input_data_path)
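Every example on this page relies on split_s3_path (and a few on split_cos_path) from sm.engine.util to turn a URI into a (bucket, key) pair. A minimal sketch of such helpers, assuming they simply strip the scheme and split on the first slash (the real implementations may differ), would be:

# Minimal sketch only; the actual helpers in sm.engine.util may handle edge cases differently.
def split_s3_path(path):
    """Split 's3://bucket/key' or 's3a://bucket/key' into (bucket, key)."""
    bucket, key = path.split('//', 1)[1].split('/', 1)
    return bucket, key


def split_cos_path(path):
    """Split 'cos://bucket/key' into (bucket, key)."""
    bucket, key = path.split('//', 1)[1].split('/', 1)
    return bucket, key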