def __init__(self, pw, dataset_name):
        """Bind the executor and derive the cache location for one dataset.

        :param pw: executor object exposing a ``config`` mapping
        :param dataset_name: name appended to the fixed cache prefix
        """
        self.pywren_executor = pw
        self.config = settings = pw.config

        # The storage client and the bucket are both taken from the
        # executor's configuration.
        self.storage_handler = get_ibm_cos_client(settings)
        self.bucket = settings['pywren']['storage_bucket']
        self.prefix = f'metabolomics/cache/{dataset_name}'
def chunk_spectra(config, input_data, imzml_parser, coordinates):
    """Split a dataset's spectra into m/z-sorted chunks and upload them to COS.

    Spectra are read one at a time from ``imzml_parser`` and accumulated until
    the estimated size of the batch exceeds 512 MB. Every batch is flattened
    into one array with a row per peak -- ``(spectrum index, m/z, intensity)``
    -- ordered by m/z, serialised with msgpack and uploaded from a worker
    thread.

    :param config: configuration mapping; passed to ``get_ibm_cos_client`` and
        read at ``config["storage"]["ds_bucket"]`` for the target bucket
    :param input_data: mapping whose ``"ds_chunks"`` entry is the key prefix
        under which the chunks are stored
    :param imzml_parser: parser providing ``getspectrum(i)`` and
        ``mzPrecision`` (used as the dtype of the uploaded array)
    :param coordinates: one entry per spectrum; passed to
        ``get_pixel_indices`` to obtain the index stored with each peak
    :return: list of the uploaded object keys, in chunk order
    :raises Exception: whatever an upload raised is re-raised when the
        results are collected
    """
    cos_client = get_ibm_cos_client(config)
    sp_id_to_idx = get_pixel_indices(coordinates)

    def chunk_size(coords, max_size=512 * 1024 ** 2):
        """Yield ``(sp_inds_list, mzs_list, ints_list)`` batches of about ``max_size`` bytes."""
        sp_inds_list, mzs_list, ints_list = [], [], []
        estimated_size_bytes = 0

        # Only the position of each entry is needed, so the coordinate tuples
        # are not unpacked; this also accepts entries with more than two
        # components.
        for curr_sp_i, _ in enumerate(coords):
            mzs_, ints_ = imzml_parser.getspectrum(curr_sp_i)
            mzs_, ints_ = map(np.array, [mzs_, ints_])
            sp_idx = sp_id_to_idx[curr_sp_i]
            sp_inds_list.append(np.ones_like(mzs_) * sp_idx)
            mzs_list.append(mzs_)
            ints_list.append(ints_)
            # The spectrum-index array is shaped like mzs_, so it is counted
            # as a second copy of mzs_.
            estimated_size_bytes += 2 * mzs_.nbytes + ints_.nbytes
            if estimated_size_bytes > max_size:
                yield sp_inds_list, mzs_list, ints_list
                sp_inds_list, mzs_list, ints_list = [], [], []
                estimated_size_bytes = 0

        # Flush the final, partially filled batch.
        if sp_inds_list:
            yield sp_inds_list, mzs_list, ints_list

    def _upload_chunk(ch_i, sp_mz_int_buf):
        """Serialise one chunk, upload it and return its object key."""
        chunk = msgpack.dumps(sp_mz_int_buf)
        key = f'{input_data["ds_chunks"]}/{ch_i}.msgpack'
        # Report the size of the payload that is actually sent.
        size = len(chunk) / 1024 ** 2
        logger.info('Uploading spectra chunk %s - %.2f MB', ch_i, size)
        cos_client.put_object(Bucket=config["storage"]["ds_bucket"],
                              Key=key,
                              Body=chunk)
        logger.info('Spectra chunk %s finished', ch_i)
        return key

    max_size = 512 * 1024 ** 2  # 512MB
    dtype = imzml_parser.mzPrecision

    futures = []
    with ThreadPoolExecutor() as ex:
        for ch_i, chunk in enumerate(chunk_size(coordinates, max_size)):
            sp_inds_list, mzs_list, ints_list = chunk
            mzs = np.concatenate(mzs_list)
            # Reorder all three columns by ascending m/z.
            by_mz = np.argsort(mzs)
            sp_mz_int_buf = np.array([np.concatenate(sp_inds_list)[by_mz],
                                      mzs[by_mz],
                                      np.concatenate(ints_list)[by_mz]], dtype).T

            logger.info('Parsed spectra chunk %s', ch_i)
            futures.append(ex.submit(_upload_chunk, ch_i, sp_mz_int_buf))

        logger.info('Parsed dataset into %s chunks', len(futures))

    # Leaving the `with` block waits for every upload to finish. result()
    # re-raises a failed upload's exception, so success is only logged once
    # all results have been collected.
    keys = [future.result() for future in futures]
    logger.info('Uploaded %s dataset chunks', len(keys))
    return keys
Example #3
0
def upload_mol_db_from_file(config, bucket, key, path, force=False):
    """Store the sorted unique ``sf`` values of a CSV file as a pickle in COS.

    The object at ``bucket``/``key`` is probed with ``head_object`` first.
    The upload happens when that probe raises ``ClientError`` or when
    ``force`` is true; otherwise the function returns without uploading.

    :param config: configuration passed to ``get_ibm_cos_client``
    :param bucket: target bucket name
    :param key: target object key
    :param path: CSV file readable by ``pandas.read_csv`` with an ``sf`` column
    :param force: upload even if the probe succeeds
    """
    client = get_ibm_cos_client(config)

    try:
        client.head_object(Bucket=bucket, Key=key)
    except ClientError:
        # The probe failed: treat the object as absent.
        needs_upload = True
    else:
        needs_upload = force

    if not needs_upload:
        return

    formulas = sorted(set(pd.read_csv(path).sf))
    client.put_object(Bucket=bucket, Key=key, Body=pickle.dumps(formulas))
Example #4
0
def convert_imzml(args, config):
    """Convert an imzML dataset to text files, optionally going through COS.

    ``args.input`` must end in ``.imzML`` and ``args.output`` in ``.txt``; a
    ``*_coord.txt`` file is written next to the output. When
    ``args.cos_input`` is set, the input is read as ``<bucket>/<key>`` and the
    ``.imzML`` object plus its ``.ibd`` sibling are downloaded to a temporary
    directory. When ``args.cos_output`` is set, the output is read as
    ``<bucket>/<key>`` and both result files are uploaded from that temporary
    directory.

    :param args: namespace with ``input``, ``output``, ``cos_input`` and
        ``cos_output`` attributes
    :param config: configuration passed to ``get_ibm_cos_client``
    :raises AssertionError: if the input or output extension is not the
        expected one
    """
    assert args.input.endswith('.imzML')
    assert args.output.endswith('.txt')

    use_cos = args.cos_input or args.cos_output
    temp_dir = TemporaryDirectory() if use_cos else None
    try:
        ibm_cos = get_ibm_cos_client(config) if use_cos else None

        # Download input if using COS
        if args.cos_input:
            logger.info('Downloading input files')
            input_bucket, input_key_imzml = args.input.split('/', 1)
            imzml_filename = input_key_imzml.split('/')[-1]
            imzml_path = str(Path(temp_dir.name) / imzml_filename)

            logger.info('download_file %s %s %s',
                        input_bucket, input_key_imzml, imzml_path)
            ibm_cos.download_file(input_bucket, input_key_imzml, imzml_path)

            # Swap the 6-character '.imzML' suffix for '.ibd'.
            input_key_ibd = input_key_imzml[:-6] + '.ibd'
            ibd_path = imzml_path[:-6] + '.ibd'
            logger.info('download_file %s %s %s',
                        input_bucket, input_key_ibd, ibd_path)
            ibm_cos.download_file(input_bucket, input_key_ibd, ibd_path)
        else:
            imzml_path = args.input

        # Generate local path for output if using COS
        if args.cos_output:
            output_bucket, output_key = args.output.split('/', 1)
            spectra_filename = output_key.split('/')[-1]
            spectra_path = str(Path(temp_dir.name) / spectra_filename)
        else:
            spectra_path = args.output
        # Swap the 4-character '.txt' suffix for '_coord.txt'.
        coord_path = spectra_path[:-4] + '_coord.txt'

        logger.info('Converting to txt')
        logger.info('convert_imzml_to_txt %s %s %s',
                    imzml_path, spectra_path, coord_path)
        convert_imzml_to_txt(imzml_path, spectra_path, coord_path)

        # Upload output if using COS
        if args.cos_output:
            logger.info('Uploading output files')
            logger.info('upload_file %s %s %s',
                        output_bucket, output_key, spectra_path)
            ibm_cos.upload_file(spectra_path, output_bucket, output_key)

            output_key_coord = output_key[:-4] + '_coord.txt'
            logger.info('upload_file %s %s %s',
                        output_bucket, output_key_coord, coord_path)
            ibm_cos.upload_file(coord_path, output_bucket, output_key_coord)
    finally:
        # Remove the scratch files deterministically, including on failure,
        # instead of leaving it to the TemporaryDirectory finalizer.
        if temp_dir is not None:
            temp_dir.cleanup()
from annotation_pipeline.utils import get_ibm_cos_client, upload_to_cos

if __name__ == '__main__':
    # Command-line entry point: walk every given path and upload each file
    # found to the dataset bucket, using its local path as the object key.
    parser = argparse.ArgumentParser(description='Upload input data to COS',
                                     usage='')
    parser.add_argument('paths',
                        type=str,
                        nargs='+',
                        help='path to upload [`tmp` is ignored]')
    parser.add_argument('--config',
                        type=argparse.FileType('r'),
                        default='config.json',
                        help='config.json path')
    # Declared as an option (parallel to --config) so that the default
    # applies and the value is stored as args.input_config, which is the
    # attribute read below.
    parser.add_argument('--input-config',
                        type=argparse.FileType('r'),
                        default='input_config.json',
                        help='input_config.json path')
    args = parser.parse_args()

    # Close both config files as soon as they have been parsed.
    with args.input_config as input_config_file:
        input_config = json.load(input_config_file)
    with args.config as config_file:
        config = json.load(config_file)
    cos_client = get_ibm_cos_client(config)

    for path in args.paths:
        for root, _, filenames in os.walk(path):
            for fn in filenames:
                # Built with '/' because the same string is the object key.
                f_path = f'{root}/{fn}'
                upload_to_cos(cos_client, f_path,
                              config['storage']['ds_bucket'], f_path)