def __init__(self, pw, dataset_name):
    self.pywren_executor = pw
    self.config = self.pywren_executor.config
    self.storage_handler = get_ibm_cos_client(self.config)
    self.bucket = self.config['pywren']['storage_bucket']
    self.prefix = f'metabolomics/cache/{dataset_name}'
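# A hypothetical illustration (not from the source) of how the cacher's fields
# set up above would typically be used: object keys are built under self.prefix
# and objects are checked through the boto3-style COS client in
# self.storage_handler. `exists` is an invented example method; ClientError is
# botocore's exception, used the same way in upload_mol_db_from_file below.
def exists(self, name):
    try:
        self.storage_handler.head_object(Bucket=self.bucket, Key=f'{self.prefix}/{name}')
        return True
    except ClientError:
        return False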
import sys
from concurrent.futures import ThreadPoolExecutor

import msgpack
import msgpack_numpy
import numpy as np

msgpack_numpy.patch()  # teach msgpack to (de)serialize numpy arrays

# get_ibm_cos_client, get_pixel_indices and logger come from the pipeline's
# own modules (see the utils import in the upload script below).


def chunk_spectra(config, input_data, imzml_parser, coordinates):
    cos_client = get_ibm_cos_client(config)
    sp_id_to_idx = get_pixel_indices(coordinates)

    def chunk_size(coords, max_size=512 * 1024 ** 2):
        """Yield lists of spectra whose estimated size stays under max_size bytes."""
        curr_sp_i = 0
        sp_inds_list, mzs_list, ints_list = [], [], []
        estimated_size = 0  # bytes (the original name `estimated_size_mb` was misleading)
        for _ in coords:  # coordinates only drive the spectrum count here
            mzs_, ints_ = imzml_parser.getspectrum(curr_sp_i)
            mzs_, ints_ = map(np.array, [mzs_, ints_])
            sp_idx = sp_id_to_idx[curr_sp_i]
            sp_inds_list.append(np.ones_like(mzs_) * sp_idx)
            mzs_list.append(mzs_)
            ints_list.append(ints_)
            estimated_size += 2 * mzs_.nbytes + ints_.nbytes
            curr_sp_i += 1
            if estimated_size > max_size:
                yield sp_inds_list, mzs_list, ints_list
                sp_inds_list, mzs_list, ints_list = [], [], []
                estimated_size = 0
        if len(sp_inds_list) > 0:
            yield sp_inds_list, mzs_list, ints_list

    def _upload_chunk(ch_i, sp_mz_int_buf):
        chunk = msgpack.dumps(sp_mz_int_buf)
        key = f'{input_data["ds_chunks"]}/{ch_i}.msgpack'
        size_mb = sys.getsizeof(chunk) / 1024 ** 2
        logger.info(f'Uploading spectra chunk {ch_i} - {size_mb:.2f} MB')
        cos_client.put_object(Bucket=config['storage']['ds_bucket'], Key=key, Body=chunk)
        logger.info(f'Spectra chunk {ch_i} finished')
        return key

    max_size = 512 * 1024 ** 2  # 512 MB
    chunk_it = chunk_size(coordinates, max_size)
    futures = []
    # Parse chunks sequentially, but upload them concurrently from worker threads.
    with ThreadPoolExecutor() as ex:
        for ch_i, chunk in enumerate(chunk_it):
            sp_inds_list, mzs_list, ints_list = chunk
            dtype = imzml_parser.mzPrecision
            # Build an (n, 3) array of (spectrum index, m/z, intensity) rows, sorted by m/z.
            mzs = np.concatenate(mzs_list)
            by_mz = np.argsort(mzs)
            sp_mz_int_buf = np.array([np.concatenate(sp_inds_list)[by_mz],
                                      mzs[by_mz],
                                      np.concatenate(ints_list)[by_mz]], dtype).T
            logger.info(f'Parsed spectra chunk {ch_i}')
            futures.append(ex.submit(_upload_chunk, ch_i, sp_mz_int_buf))
        logger.info(f'Parsed dataset into {len(futures)} chunks')

    keys = [future.result() for future in futures]
    logger.info(f'Uploaded {len(futures)} dataset chunks')
    return keys
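# A minimal usage sketch, assuming pyimzml is installed and that `config` and
# `input_data` follow the shapes referenced above ('storage'/'ds_bucket' in the
# config, 'ds_chunks' as the chunk key prefix). The file path and prefix are
# placeholders, not values from the source.
from pyimzml.ImzMLParser import ImzMLParser

imzml_parser = ImzMLParser('ds.imzML')       # also opens the paired ds.ibd file
coordinates = imzml_parser.coordinates       # list of pixel coordinate tuples
input_data = {'ds_chunks': 'metabolomics/ds_chunks'}
chunk_keys = chunk_spectra(config, input_data, imzml_parser, coordinates)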
import pickle

import pandas as pd
from botocore.exceptions import ClientError


def upload_mol_db_from_file(config, bucket, key, path, force=False):
    ibm_cos = get_ibm_cos_client(config)
    try:
        # Skip the upload if the object already exists, unless force is set.
        ibm_cos.head_object(Bucket=bucket, Key=key)
        should_dump = force
    except ClientError:
        should_dump = True

    if should_dump:
        # Deduplicate and sort the molecular sum formulas before pickling.
        mol_sfs = sorted(set(pd.read_csv(path).sf))
        ibm_cos.put_object(Bucket=bucket, Key=key, Body=pickle.dumps(mol_sfs))
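# Example call, assuming the config/bucket layout used elsewhere in this
# section; the key and CSV path are placeholders. The CSV must contain an `sf`
# (sum formula) column, since the function reads `pd.read_csv(path).sf`.
upload_mol_db_from_file(config,
                        bucket=config['storage']['ds_bucket'],
                        key='metabolomics/db/mol_db1.pickle',
                        path='mol_db1.csv')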
from pathlib import Path
from tempfile import TemporaryDirectory


def convert_imzml(args, config):
    assert args.input.endswith('.imzML')
    assert args.output.endswith('.txt')

    if args.cos_input or args.cos_output:
        temp_dir = TemporaryDirectory()
        ibm_cos = get_ibm_cos_client(config)

    # Download input if using COS
    if args.cos_input:
        logger.info('Downloading input files')
        input_bucket, input_key_imzml = args.input.split('/', 1)
        imzml_filename = input_key_imzml.split('/')[-1]
        imzml_path = str(Path(temp_dir.name) / imzml_filename)
        logger.info(f'download_file {input_bucket} {input_key_imzml} {imzml_path}')
        ibm_cos.download_file(input_bucket, input_key_imzml, imzml_path)

        # The .ibd binary file sits next to the .imzML index file.
        input_key_ibd = input_key_imzml[:-6] + '.ibd'
        ibd_path = imzml_path[:-6] + '.ibd'
        logger.info(f'download_file {input_bucket} {input_key_ibd} {ibd_path}')
        ibm_cos.download_file(input_bucket, input_key_ibd, ibd_path)
    else:
        imzml_path = args.input

    # Generate local path for output if using COS
    if args.cos_output:
        output_bucket, output_key = args.output.split('/', 1)
        spectra_filename = output_key.split('/')[-1]
        spectra_path = str(Path(temp_dir.name) / spectra_filename)
    else:
        spectra_path = args.output
    coord_path = spectra_path[:-4] + '_coord.txt'

    logger.info('Converting to txt')
    logger.info(f'convert_imzml_to_txt {imzml_path} {spectra_path} {coord_path}')
    convert_imzml_to_txt(imzml_path, spectra_path, coord_path)

    # Upload output if using COS
    if args.cos_output:
        logger.info('Uploading output files')
        logger.info(f'upload_file {output_bucket} {output_key} {spectra_path}')
        ibm_cos.upload_file(spectra_path, output_bucket, output_key)

        output_key_coord = output_key[:-4] + '_coord.txt'
        logger.info(f'upload_file {output_bucket} {output_key_coord} {coord_path}')
        ibm_cos.upload_file(coord_path, output_bucket, output_key_coord)
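# A minimal invocation sketch: convert_imzml only reads .input/.output/
# .cos_input/.cos_output from `args`, so a plain Namespace works outside the
# CLI. The bucket and key paths are placeholders; with cos_input=True, `input`
# is interpreted as '<bucket>/<key>'.
from argparse import Namespace

args = Namespace(input='my-bucket/metabolomics/ds.imzML',
                 output='my-bucket/metabolomics/ds.txt',
                 cos_input=True,
                 cos_output=True)
convert_imzml(args, config)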
import argparse
import json
import os

from annotation_pipeline.utils import get_ibm_cos_client, upload_to_cos

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Upload input data to COS')
    parser.add_argument('paths', type=str, nargs='+',
                        help='path to upload [`tmp` is ignored]')
    parser.add_argument('--config', type=argparse.FileType('r'), default='config.json',
                        help='config.json path')
    parser.add_argument('--input', type=argparse.FileType('r'), default='input_config.json',
                        help='input_config.json path')
    args = parser.parse_args()

    input_config = json.load(args.input)
    config = json.load(args.config)
    cos_client = get_ibm_cos_client(config)

    # Walk each given path and mirror every file into the dataset bucket,
    # reusing the local relative path as the object key.
    for path in args.paths:
        for root, dirnames, filenames in os.walk(path):
            for fn in filenames:
                f_path = f'{root}/{fn}'
                upload_to_cos(cos_client, f_path, config['storage']['ds_bucket'], f_path)
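# Example invocation, assuming the script above is saved as upload.py (the
# actual filename is not shown in this excerpt):
#   python upload.py input_ds/ input_db/ --config config.json --input input_config.json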