async def register_sample(client: blueno.PlatformClient,
                          dataset_name: str,
                          sample: Dict,
                          split_name: str,
                          sample_no: int):
    """
    Save one sample's image as a .npy file and register it with the platform.

    The array under ``sample['image']`` is written to
    ``FILESYSTEM_STORE_ROOT/data/{dataset_name}/{sample_name}.npy`` and the
    sample is registered with the URL
    ``file://data/{dataset_name}/{sample_name}.npy``.

    The sample name comes from the first filename-like key present in
    ``sample`` (with '/' replaced by '-'); when no such key exists it falls
    back to ``{split_name}-{sample_no}``.

    :param client: platform client used to register the sample
    :param dataset_name: dataset the sample is registered under
    :param sample: mapping of feature name to value; must contain 'image'
    :param split_name: split recorded on the sample (also used in the
        fallback sample name)
    :param sample_no: index used in the fallback sample name
    """
    started_at = time.time()

    # Take the first key (in iteration order) that looks like a filename.
    filename_keys = ('filename', 'file_name',
                     'image/filename', 'image/file_name')
    name_key = next((k for k in sample if k in filename_keys), None)
    if name_key is None:
        sample_name = f'{split_name}-{sample_no}'
    else:
        sample_name = decode(sample[name_key]).replace('/', '-')
    logging.info(f'registering sample {sample_name}')

    arr = sample['image']
    label = decode(sample['label']) if 'label' in sample else None

    # Every non-image feature is decoded on a best-effort basis; features
    # that raise TypeError are logged and left out.
    feature_info = {}
    for feature_name in sample:
        if feature_name == 'image':
            continue
        try:
            feature_info[feature_name] = decode(sample[feature_name])
        except TypeError as e:
            logging.info(f"Failed to decode feature '{feature_name}': {e}")

    # The relative path doubles as the path component of the file:// URL.
    rel_sample_path = f'data/{dataset_name}/{sample_name}.npy'
    target = pathlib.Path(FILESYSTEM_STORE_ROOT) / rel_sample_path
    target.parent.mkdir(parents=True, exist_ok=True)
    numpy.save(target, arr)

    client.register_sample(
        sample_name,
        dataset_name,
        data_url=f'file://{rel_sample_path}',
        image_type='2D',
        label=label,
        split=split_name,
        other_info={'feature': feature_info},
    )

    elapsed = time.time() - started_at
    logging.info(f'registered sample {sample_name} in {elapsed} seconds')
def test_register_sample_validate(client: PlatformClient):
    """
    Check that a failed validated registration leaves no sample behind.

    Registering with ``validate=True`` against the URL below is expected to
    raise ``PlatformError``, after which the dataset must list zero samples.
    """
    dataset_name = 'blueno::test_register_sample_validate'
    data_url = 'gs://elvo-platform/test/register_validate/no-data.xzx'
    client.create_dataset(dataset_name)

    with pytest.raises(PlatformError):
        client.register_sample(
            'test-sample',
            dataset_name,
            data_url=data_url,
            validate=True,
            split='training',
        )

    # The rejected registration must not have created a sample.
    remaining = client.list_samples(dataset_name)
    assert len(remaining) == 0
def register():
    """
    Register every .npy blob under gs://elvo-platform/elvo/raw/numpy/.

    Creates the DATASET_NAME dataset, then registers one '3D' sample per
    blob whose name ends in '.npy'. The sample name is the blob's final
    path component without the '.npy' suffix. Progress and per-sample
    timing are printed to stdout.
    """
    client = PlatformClient(API_SERVER, EMAIL, PASSWORD)
    client.create_dataset(
        DATASET_NAME,
        description="Raw singlephase ELVO scans in NPY form.")

    bucket = storage.Client().get_bucket('elvo-platform')
    suffix = '.npy'

    for blob in bucket.list_blobs(prefix='elvo/raw/numpy/'):
        blob_name = blob.name
        if not blob_name.endswith(suffix):
            continue

        # Last path component, minus the extension.
        sample_name = blob_name.rsplit('/', 1)[-1][:-len(suffix)]
        gcs_url = f'gs://{bucket.name}/{blob_name}'
        print(f"Registering sample={sample_name} with url={gcs_url}",
              flush=True)

        started_at = time.time()
        created = client.register_sample(sample_name,
                                         DATASET_NAME,
                                         gcs_url,
                                         image_type='3D')
        elapsed = time.time() - started_at

        # A falsy return is reported as "already exists".
        if created:
            print(f"Registered {sample_name} in {elapsed} seconds")
        else:
            print(f"Found {sample_name} exists in {elapsed} seconds")
def load():
    """
    Register every .npz file found under the multiphase ELVO dataset root.

    Creates the DATASET_NAME dataset, walks
    '/research/rih-cs/datasets/elvo-multiphase/v1.0' recursively and
    registers one '3D' sample per '.npz' file. The sample name is the part
    of the filename before the first '.', the label is 'positive' when the
    filename starts with 'P' and 'negative' otherwise, and the data URL is
    ``{DATA_PREFIX}/{filename}`` (the containing directory is not part of
    the URL).
    """
    platform = PlatformClient(API_SERVER, EMAIL, PASSWORD)
    platform.create_dataset(
        DATASET_NAME,
        description="Raw multiphase ELVO scans in NPZ form.")

    dataset_root = '/research/rih-cs/datasets/elvo-multiphase/v1.0'
    for _dirpath, _, filenames in os.walk(dataset_root):
        for filename in filenames:
            if not filename.endswith('.npz'):
                continue

            sample_name = filename.partition('.')[0]
            if filename.startswith('P'):
                label = 'positive'
            else:
                label = 'negative'
            data_url = f'{DATA_PREFIX}/{filename}'
            print(f"Registering sample={sample_name} with"
                  f" label={label} and url={data_url}", flush=True)

            started_at = time.time()
            created = platform.register_sample(
                sample_name,
                DATASET_NAME,
                data_url=data_url,
                image_type='3D',
                label=label,
            )
            elapsed = time.time() - started_at

            # A falsy return is reported as "already exists".
            if created:
                print(f"Registered {filename} in {elapsed} seconds")
            else:
                print(f"Found {filename} exists in {elapsed} seconds")
def test_crud_samples(client: PlatformClient):
    """
    Exercise create / list / delete of samples within one dataset.

    Covers: first registration succeeds, a duplicate name is rejected
    without touching the stored 'info', a second distinct sample can be
    added, and both samples plus the dataset can be deleted afterwards.
    """
    dataset_name = 'blueno::test_crud_samples'
    first_name, second_name, _unused_name = 'smaple1', 'snapple2', 'water3'
    no_data_url = 'file://test/crud_samples/no-data.xzx'
    with_data_url = 'file://test/crud_samples/with-data.txt'

    client.create_dataset(dataset_name)

    # Registering without validation should succeed.
    first_created = client.register_sample(
        first_name,
        dataset_name,
        data_url=no_data_url,
        validate=False,
        split='training')
    assert first_created
    assert len(client.list_samples(dataset_name)) == 1

    # Re-registering the same sample name should be refused ...
    duplicate_created = client.register_sample(
        first_name,
        dataset_name,
        data_url=no_data_url,
        validate=False,
        split='test')
    assert not duplicate_created
    after_duplicate = client.list_samples(dataset_name)
    assert len(after_duplicate) == 1
    # ... and must leave the originally stored 'info' untouched.
    assert after_duplicate[0]['info']['split'] == 'training'

    # A second, differently named sample should be accepted
    # (keyword form of the name/dataset arguments).
    second_created = client.register_sample(
        name=second_name,
        dataset=dataset_name,
        data_url=with_data_url,
        validate=False,
        split='training')
    assert second_created
    after_second = client.list_samples(dataset_name)
    assert len(after_second) == 2

    # Cleanup: removing both samples empties the dataset.
    for sample_name in (first_name, second_name):
        client.delete_sample(sample_name, dataset_name)
    assert len(client.list_samples(dataset_name)) == 0
    client.delete_dataset(dataset_name)
def register_mnist_az():
    """
    Register every PNG under the local 'mnist_png' tree as an 'mnist-az' sample.

    Creates the 'mnist-az' dataset, walks 'mnist_png' recursively and
    registers each '.png' file without validation. For a file in directory
    ``d`` (split on '/'): the label is the last component of ``d``, the
    split is the second component, the sample name is
    ``{filename up to the first '.'}-{split}``, and the data URL is
    ``az://ml-platform/{d with 'mnist_png' replaced by 'data'}/{filename}``.
    """
    platform = PlatformClient(API_SERVER, EMAIL, PASSWORD)
    dataset_name = 'mnist-az'
    platform.create_dataset(dataset_name,
                            description="MNIST on Azure in PNG form.")

    for dirpath, _, filenames in os.walk('mnist_png'):
        for filename in filenames:
            if not filename.endswith('.png'):
                continue

            # Directory components are only inspected for directories that
            # actually contain a PNG.
            path_parts = dirpath.split('/')
            label = path_parts[-1]
            split = path_parts[1]

            started_at = time.time()
            stem = filename.split('.')[0]
            sample_name = f"{stem}-{split}"
            remote_dir = dirpath.replace('mnist_png', 'data')
            data_url = f'az://ml-platform/{remote_dir}/{filename}'
            print(
                f"Registering {sample_name} with label {label},"
                f" split {split}, and data_url {data_url}",
                flush=True)

            created = platform.register_sample(
                sample_name,
                dataset_name,
                data_url=data_url,
                validate=False,
                label=label,
                split=split,
            )
            elapsed = time.time() - started_at

            # A falsy return is reported as "already exists".
            if created:
                print(f"REGISTERED: processed {filename}"
                      f" in {elapsed} seconds")
            else:
                print(f"ALREADY EXISTS: processed {filename}"
                      f" in {elapsed} seconds")