def pull_objects(self, dataset: Dataset, objects: List[PullObject], progress_update_fn: Callable) -> PullResult:
    """High-level method that links files from the local source directory into the object directory and
    then into the revision directory

    Args:
        dataset: The current dataset
        objects: A list of PullObjects that enumerate the objects to pull
        progress_update_fn: A callable with arg "completed_bytes" (int) indicating how many bytes have been
                            downloaded since the last call

    Returns:
        PullResult
    """
    # Link from the local data directory to the object directory
    for obj in objects:
        if os.path.exists(obj.object_path):
            # Re-link to make sure all links are consistent if a link already exists
            os.remove(obj.object_path)

        os.link(os.path.join(self._get_local_data_dir(), obj.dataset_path), obj.object_path)
        progress_update_fn(os.path.getsize(obj.object_path))

    # Link from the object dir through to the revision dir
    m = Manifest(dataset, self.configuration.get('username'))
    m.link_revision()

    return PullResult(success=objects,
                      failure=[],
                      message="Linked data directory. All files from the manifest should be available")
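
# The local backend never copies data on pull: it builds a chain of hard links from the source
# directory into the object directory and then into the current revision. The standalone sketch
# below (hypothetical paths, no Manifest involved) illustrates why all three locations stay in
# sync without duplicating any bytes. It assumes a filesystem that supports hard links.
import os
import tempfile

with tempfile.TemporaryDirectory() as root:
    source_dir = os.path.join(root, 'local_data')
    object_dir = os.path.join(root, 'objects')
    revision_dir = os.path.join(root, 'revision')
    for d in (source_dir, object_dir, revision_dir):
        os.makedirs(d)

    source_file = os.path.join(source_dir, 'test1.txt')
    with open(source_file, 'wt') as f:
        f.write('example data')

    # Source dir -> object dir (what pull_objects does), then object dir -> revision dir
    os.link(source_file, os.path.join(object_dir, 'abc123'))
    os.link(os.path.join(object_dir, 'abc123'), os.path.join(revision_dir, 'test1.txt'))

    # All three paths reference the same inode, so no data was copied
    assert os.stat(source_file).st_ino == os.stat(os.path.join(revision_dir, 'test1.txt')).st_ino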
def test_update_from_local(self, mock_dataset_with_local_dir):
    ds = mock_dataset_with_local_dir[0]
    assert ds.backend.can_update_from_remote() is True

    m = Manifest(ds, 'tester')
    assert len(m.manifest.keys()) == 0

    ds.backend.update_from_remote(ds, updater)

    m = Manifest(ds, 'tester')
    assert len(m.manifest.keys()) == 4
    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test1.txt'))
    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test2.txt'))
    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'subdir', 'test3.txt'))

    modified_items = ds.backend.verify_contents(ds, updater)
    assert len(modified_items) == 0

    test_dir = os.path.join(mock_dataset_with_local_dir[1], "local_data", "test_dir")
    with open(os.path.join(test_dir, 'test1.txt'), 'wt') as tf:
        tf.write("This file got changed in the filesystem")

    modified_items = ds.backend.verify_contents(ds, updater)
    assert len(modified_items) == 1
    assert 'test1.txt' in modified_items

    ds.backend.update_from_local(ds, updater, verify_contents=True)

    assert len(m.manifest.keys()) == 4
    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test1.txt'))
    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test2.txt'))
    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'subdir', 'test3.txt'))

    modified_items = ds.backend.verify_contents(ds, updater)
    assert len(modified_items) == 0

    with open(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test1.txt'), 'rt') as tf:
        assert tf.read() == "This file got changed in the filesystem"
def test_update_from_remote(self, mock_config_class, mock_public_bucket):
    im = mock_config_class[0]
    ds = im.create_dataset(USERNAME, USERNAME, 'dataset-1',
                           description="my dataset 1",
                           storage_type="public_s3_bucket")
    ds.backend.set_default_configuration(USERNAME, 'fakebearertoken', 'fakeidtoken')
    assert ds.backend.can_update_from_remote() is True

    m = Manifest(ds, USERNAME)
    assert len(m.manifest.keys()) == 0

    # Configure backend completely
    current_config = ds.backend_config
    current_config['Bucket Name'] = mock_public_bucket
    current_config['Prefix'] = ""
    ds.backend_config = current_config

    # Trigger update
    ds.backend.update_from_remote(ds, updater)

    m = Manifest(ds, USERNAME)
    assert len(m.manifest.keys()) == 7
    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test-file-1.bin'))
    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test-file-2.bin'))
    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'metadata/test-file-3.bin'))
    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'metadata/test-file-4.bin'))
    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'metadata/sub/test-file-5.bin'))

    with open(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test-file-1.bin'), 'rt') as tf:
        data = tf.read()
        assert data[0:4] == 'asdf'

    with open(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'metadata/test-file-4.bin'), 'rt') as tf:
        data = tf.read()
        assert data[0:4] == '1234'
def pull_objects(self, dataset: Dataset, objects: List[PullObject], progress_update_fn: Callable) -> PullResult:
    """High-level method to download objects from the remote S3 bucket into the object directory and then
    link them into the revision directory

    Args:
        dataset: The current dataset
        objects: A list of PullObjects that enumerate the objects to pull
        progress_update_fn: A callable with arg "completed_bytes" (int) indicating how many bytes have been
                            downloaded since the last call

    Returns:
        PullResult
    """
    client = self._get_client()
    bucket, prefix = self._get_s3_config()

    backend_config = dataset.client_config.config['datasets']['backends'][dataset.backend.storage_type]
    chunk_size = backend_config['download_chunk_size']

    success = list()
    failure = list()
    message = f"Downloaded {len(objects)} objects successfully."
    for obj in objects:
        # Get the object from the bucket
        response = client.get_object(Bucket=bucket, Key=os.path.join(prefix, obj.dataset_path))
        if response['ResponseMetadata']['HTTPStatusCode'] == 200:
            # Stream the object body to disk in chunks, updating progress as each chunk is written
            with open(obj.object_path, 'wb') as out_file:
                for chunk in response['Body'].iter_chunks(chunk_size=chunk_size):
                    out_file.write(chunk)
                    progress_update_fn(len(chunk))
            success.append(obj)
        else:
            failure.append(obj)

    if len(failure) > 0:
        message = f"Downloaded {len(success)} objects successfully, but {len(failure)} failed. Check results."

    # Link from the object dir through to the revision dir
    m = Manifest(dataset, self.configuration.get('username'))
    m.link_revision()

    return PullResult(success=success, failure=failure, message=message)
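
# The chunked download above can be exercised in isolation. The sketch below is a minimal,
# hypothetical example (the bucket, key, and default chunk size are placeholders, not part of
# the backend) of streaming a public S3 object to disk with boto3 while tracking progress,
# the same pattern pull_objects uses via "download_chunk_size".
import boto3
from botocore import UNSIGNED
from botocore.client import Config


def stream_object(bucket: str, key: str, destination: str, chunk_size: int = 4 * 1024 * 1024) -> int:
    """Stream a single public S3 object to `destination`, returning the total bytes written."""
    # Unsigned requests so the example works against a public bucket without credentials
    client = boto3.client('s3', config=Config(signature_version=UNSIGNED))
    response = client.get_object(Bucket=bucket, Key=key)

    total = 0
    with open(destination, 'wb') as out_file:
        # iter_chunks() yields the body in chunk_size pieces without loading it all into memory
        for chunk in response['Body'].iter_chunks(chunk_size=chunk_size):
            out_file.write(chunk)
            total += len(chunk)
    return total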
def test_update_from_remote_backend_change(self, mock_config_class, mock_public_bucket):
    im = mock_config_class[0]
    ds = im.create_dataset(USERNAME, USERNAME, 'dataset-1',
                           description="my dataset 1",
                           storage_type="public_s3_bucket")
    ds.backend.set_default_configuration(USERNAME, 'fakebearertoken', 'fakeidtoken')
    assert ds.backend.can_update_from_remote() is True

    m = Manifest(ds, USERNAME)
    assert len(m.manifest.keys()) == 0

    # Configure backend completely
    current_config = ds.backend_config
    current_config['Bucket Name'] = mock_public_bucket
    current_config['Prefix'] = ""
    ds.backend_config = current_config

    # Trigger update
    ds.backend.update_from_remote(ds, updater)

    m = Manifest(ds, USERNAME)
    assert len(m.manifest.keys()) == 7

    modified_items = ds.backend.verify_contents(ds, updater)
    assert len(modified_items) == 0

    with tempfile.NamedTemporaryFile('wt') as tf:
        conn = boto3.resource('s3', region_name='us-east-1')
        tf.write("This file has been updated!")
        tf.seek(0)
        conn.meta.client.upload_file(tf.name, mock_public_bucket, 'test-file-1.bin')

        ds.backend.update_from_remote(ds, updater)

        assert len(m.manifest.keys()) == 7

        modified_items = ds.backend.verify_contents(ds, updater)
        assert len(modified_items) == 0

        with open(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test-file-1.bin'), 'rt') as tf:
            assert tf.read() == "This file has been updated!"
def test_update_from_remote(self, mock_dataset_with_local_dir):
    ds = mock_dataset_with_local_dir[0]
    assert ds.backend.can_update_from_remote() is True

    m = Manifest(ds, 'tester')
    assert len(m.manifest.keys()) == 0

    ds.backend.update_from_remote(ds, updater)

    m = Manifest(ds, 'tester')
    assert len(m.manifest.keys()) == 4
    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test1.txt'))
    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test2.txt'))
    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'subdir', 'test3.txt'))
def verify_contents(self, dataset, status_update_fn: Callable) -> List[str]:
    """Method to verify the hashes of all local files and indicate if they have changed

    Args:
        dataset: Dataset object
        status_update_fn: A callable, accepting a string for logging/providing status to the UI

    Returns:
        list: Keys of files whose contents have been modified locally
    """
    if 'username' not in self.configuration:
        raise ValueError("Dataset storage backend requires current logged in username to verify contents")

    m = Manifest(dataset, self.configuration.get('username'))

    keys_to_verify = list()
    for item in m.manifest:
        if os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, item)):
            # File exists locally
            keys_to_verify.append(item)

    # Re-hash files
    status_update_fn(f"Validating contents of {len(keys_to_verify)} files. Please wait.")
    updated_hashes = self.hash_file_key_list(dataset, keys_to_verify)

    modified_items = list()
    for key, new_hash in zip(keys_to_verify, updated_hashes):
        item = m.manifest.get(key)
        if item:
            if new_hash != item.get('h'):
                modified_items.append(key)

    if modified_items:
        status_update_fn(f"Integrity check complete. {len(modified_items)} files have been modified.")
    else:
        status_update_fn("Integrity check complete. No files have been modified.")

    return modified_items
def update_from_remote(self, dataset, status_update_fn: Callable) -> None:
    """Optional method that updates the dataset by comparing against the remote. Not all unmanaged dataset
    backends will be able to do this.

    Args:
        dataset: Dataset object
        status_update_fn: A callable, accepting a string for logging/providing status to the UI

    Returns:
        None
    """
    if 'username' not in self.configuration:
        raise ValueError("Dataset storage backend requires current logged in username to verify contents")

    m = Manifest(dataset, self.configuration.get('username'))

    # Walk the remote, comparing ETags against the cached versions
    etag_data = self._load_etag_data(dataset)

    bucket, prefix = self._get_s3_config()
    client = self._get_client()

    paginator = client.get_paginator('list_objects_v2')
    response_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix)

    all_files = list()
    added_files = list()
    modified_files = list()
    print_cnt = 0
    revision_dir = os.path.join(m.cache_mgr.cache_root, m.dataset_revision)
    for x in response_iterator:
        # Cycle a simple "please wait" status message while pages are processed
        if print_cnt == 0:
            status_update_fn("Processing Bucket Contents, please wait.")
            print_cnt += 1
        elif print_cnt == 1:
            status_update_fn("Processing Bucket Contents, please wait..")
            print_cnt += 1
        else:
            status_update_fn("Processing Bucket Contents, please wait...")
            print_cnt = 0

        for item in x.get("Contents"):
            key = item['Key']
            all_files.append(key)
            if key in m.manifest:
                # Object is already tracked
                if etag_data[key] != item['ETag']:
                    # Object has been modified since the last update
                    modified_files.append(key)

                    if os.path.exists(os.path.join(revision_dir, key)):
                        # Delete the current version
                        os.remove(os.path.join(revision_dir, key))

                    if key[-1] == "/":
                        # Key is a "directory"
                        os.makedirs(os.path.join(revision_dir, key), exist_ok=True)
                    else:
                        client.download_file(bucket, key, os.path.join(revision_dir, key))
            else:
                # New object
                etag_data[key] = item['ETag']
                added_files.append(key)

                if key[-1] == "/":
                    # Key is a "directory"
                    os.makedirs(os.path.join(revision_dir, key), exist_ok=True)
                else:
                    os.makedirs(os.path.dirname(os.path.join(revision_dir, key)), exist_ok=True)
                    client.download_file(bucket, key, os.path.join(revision_dir, key))

    deleted_files = sorted(list(set(m.manifest.keys()).difference(all_files)))

    # Create a StatusResult to force modifications
    status = StatusResult(created=added_files, modified=modified_files, deleted=deleted_files)

    self._save_etag_data(dataset, etag_data)

    # Run local update
    self.update_from_local(dataset, status_update_fn, status_result=status)
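
# The remote scan above is driven by list_objects_v2 pagination plus ETag comparison. The
# fragment below is a stripped-down, hypothetical version of that loop (the bucket, prefix,
# cached_etags dict, and tracked_keys set are placeholders) showing how keys get classified
# as added, modified, or deleted before being handed to update_from_local().
import boto3


def classify_remote_changes(bucket: str, prefix: str, cached_etags: dict, tracked_keys: set):
    """Classify bucket contents into added/modified/deleted keys by comparing ETags."""
    client = boto3.client('s3')
    paginator = client.get_paginator('list_objects_v2')

    added, modified, seen = [], [], set()
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for item in page.get('Contents', []):
            key = item['Key']
            seen.add(key)
            if key in tracked_keys:
                # Already tracked: a different ETag means the object changed remotely
                if cached_etags.get(key) != item['ETag']:
                    modified.append(key)
            else:
                # Never seen before
                added.append(key)

    # Tracked keys that no longer appear in the bucket were deleted remotely
    deleted = sorted(tracked_keys - seen)
    return added, modified, deleted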
def update_from_remote(self, dataset, status_update_fn: Callable) -> None:
    """Optional method that updates the dataset by comparing against the remote. Not all unmanaged dataset
    backends will be able to do this.

    Args:
        dataset: Dataset object
        status_update_fn: A callable, accepting a string for logging/providing status to the UI

    Returns:
        None
    """
    if 'username' not in self.configuration:
        raise ValueError("Dataset storage backend requires current logged in username to verify contents")

    m = Manifest(dataset, self.configuration.get('username'))

    # Walk the local source dir, looking for additions/deletions
    all_files = list()
    added_files = list()
    local_data_dir = self._get_local_data_dir()

    os.makedirs(os.path.join(m.cache_mgr.cache_root, m.dataset_revision), exist_ok=True)

    for root, dirs, files in os.walk(local_data_dir):
        _, folder = root.split(local_data_dir)
        if len(folder) > 0:
            if folder[0] == os.path.sep:
                folder = folder[1:]

        for d in dirs:
            # TODO: Check for ignored
            # All folders are represented with a trailing slash
            rel_path = os.path.join(folder, d) + os.path.sep
            all_files.append(rel_path)
            if rel_path not in m.manifest:
                added_files.append(rel_path)

                # Create the dir in the current revision for linking to work
                os.makedirs(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, rel_path), exist_ok=True)

        for file in files:
            # TODO: Check for ignored
            if file in ['.smarthash', '.DS_STORE', '.DS_Store']:
                continue

            rel_path = os.path.join(folder, file)
            all_files.append(rel_path)
            if rel_path not in m.manifest:
                added_files.append(rel_path)

                # Hard link into the current revision for downstream linking to work
                logger.warning(os.path.join(root, file))
                if not os.path.exists(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, rel_path)):
                    os.link(os.path.join(root, file),
                            os.path.join(m.cache_mgr.cache_root, m.dataset_revision, rel_path))

    # TODO: Consider whether manifest linking is needed at this point (the manifest is not populated yet)

    deleted_files = sorted(list(set(m.manifest.keys()).difference(all_files)))

    # Create a StatusResult to force modifications
    status = StatusResult(created=added_files, modified=[], deleted=deleted_files)

    # Link the revision dir
    m.link_revision()

    # Run local update
    self.update_from_local(dataset, status_update_fn, status_result=status, verify_contents=True)
def test_pull(self, mock_dataset_with_local_dir):
    def chunk_update_callback(completed_bytes: int):
        """Method to update the job's metadata and provide feedback to the UI"""
        assert type(completed_bytes) == int
        assert completed_bytes > 0

    ds = mock_dataset_with_local_dir[0]
    m = Manifest(ds, 'tester')
    assert len(m.manifest.keys()) == 0

    ds.backend.update_from_remote(ds, updater)
    m = Manifest(ds, 'tester')

    # Remove revision dir
    shutil.rmtree(os.path.join(m.cache_mgr.cache_root, m.dataset_revision))

    keys = ['test1.txt', 'test2.txt', 'subdir/test3.txt']
    pull_objects = list()
    for key in keys:
        pull_objects.append(PullObject(object_path=m.dataset_to_object_path(key),
                                       revision=m.dataset_revision,
                                       dataset_path=key))
        # Remove objects
        os.remove(m.dataset_to_object_path(key))

    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test1.txt')) is False
    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test2.txt')) is False
    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'subdir', 'test3.txt')) is False

    for key in keys:
        assert os.path.isfile(m.dataset_to_object_path(key)) is False

    # Pull 1 file
    ds.backend.pull_objects(ds, [pull_objects[0]], chunk_update_callback)

    assert os.path.isdir(os.path.join(m.cache_mgr.cache_root, m.dataset_revision))
    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test1.txt')) is True
    assert os.path.isfile(m.dataset_to_object_path('test1.txt')) is True

    # Pull all files
    ds.backend.pull_objects(ds, pull_objects, chunk_update_callback)

    assert os.path.isdir(os.path.join(m.cache_mgr.cache_root, m.dataset_revision))
    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test1.txt')) is True
    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test2.txt')) is True
    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'subdir', 'test3.txt')) is True

    for key in keys:
        assert os.path.isfile(m.dataset_to_object_path(key)) is True
def test_pull(self, mock_config_class, mock_public_bucket):
    im = mock_config_class[0]
    ds = im.create_dataset(USERNAME, USERNAME, 'dataset-1',
                           description="my dataset 1",
                           storage_type="public_s3_bucket")
    ds.backend.set_default_configuration(USERNAME, 'fakebearertoken', 'fakeidtoken')

    # Configure backend completely
    current_config = ds.backend_config
    current_config['Bucket Name'] = mock_public_bucket
    current_config['Prefix'] = ""
    ds.backend_config = current_config

    ds.backend.update_from_remote(ds, updater)
    m = Manifest(ds, 'tester')

    # Remove revision dir and objects from cache
    shutil.rmtree(os.path.join(m.cache_mgr.cache_root, m.dataset_revision))

    keys = ['test-file-1.bin', 'metadata/test-file-3.bin', 'metadata/sub/test-file-5.bin']
    pull_objects = list()
    for key in keys:
        pull_objects.append(PullObject(object_path=m.dataset_to_object_path(key),
                                       revision=m.dataset_revision,
                                       dataset_path=key))
        # Remove objects
        os.remove(m.dataset_to_object_path(key))

    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test-file-1.bin')) is False
    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'metadata', 'test-file-3.bin')) is False
    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'metadata', 'sub', 'test-file-5.bin')) is False

    for key in keys:
        assert os.path.isfile(m.dataset_to_object_path(key)) is False

    # Pull 1 file (duplicate contents so 2 files show up)
    ds.backend.pull_objects(ds, [pull_objects[0]], chunk_update_callback)

    assert os.path.isdir(os.path.join(m.cache_mgr.cache_root, m.dataset_revision))
    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test-file-1.bin')) is True
    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test-file-2.bin')) is True
    assert os.path.isfile(m.dataset_to_object_path('test-file-1.bin')) is True
    assert os.path.isfile(m.dataset_to_object_path('test-file-2.bin')) is True

    # Pull all files
    ds.backend.pull_objects(ds, pull_objects, chunk_update_callback)

    assert os.path.isdir(os.path.join(m.cache_mgr.cache_root, m.dataset_revision))
    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test-file-1.bin')) is True
    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test-file-2.bin')) is True
    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'metadata', 'test-file-3.bin')) is True
    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'metadata', 'test-file-4.bin')) is True
    assert os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'metadata', 'sub', 'test-file-5.bin')) is True

    for key in keys:
        assert os.path.isfile(m.dataset_to_object_path(key)) is True
def update_from_local(self, dataset, status_update_fn: Callable, verify_contents: bool = False,
                      status_result: Optional[StatusResult] = None) -> None:
    """Method to update the dataset manifest for changed files that exist locally

    Args:
        dataset: Dataset object
        status_update_fn: A callable, accepting a string for logging/providing status to the UI
        verify_contents: Boolean indicating if "verify_contents" should be run, with its results added to
                         the modified files
        status_result: Optional StatusResult object to include in the update (typically from update_from_remote())

    Returns:
        None
    """
    if 'username' not in self.configuration:
        raise ValueError("Dataset storage backend requires current logged in username to verify contents")

    m = Manifest(dataset, self.configuration.get('username'))

    status_update_fn("Updating Dataset manifest from local file state.")

    if status_result is not None:
        if status_result.modified is not None:
            modified_keys = copy.deepcopy(status_result.modified)
        else:
            modified_keys = list()
    else:
        modified_keys = list()

    if verify_contents:
        modified_keys.extend(self.verify_contents(dataset, status_update_fn))

    # Create a StatusResult to force modifications
    if status_result:
        created_result = copy.deepcopy(status_result.created)

        # Check if any directories got created
        for key in status_result.created:
            if key[-1] != '/':
                # It's a file
                if os.path.dirname(key) not in m.manifest:
                    # Add the directory to the manifest
                    created_result.append(f"{os.path.dirname(key)}/")

        created_result = list(set(created_result))
        if '/' in created_result:
            created_result.remove('/')

        # Combine with the previous StatusResult object (typically from "update_from_remote")
        status = StatusResult(created=created_result,
                              modified=modified_keys,
                              deleted=status_result.deleted)
    else:
        status = StatusResult(created=[], modified=modified_keys, deleted=[])

    # Update the manifest
    previous_revision = m.dataset_revision

    m.update(status)
    m.create_update_activity_record(status)

    # Link the revision dir and remove the previous revision
    m.link_revision()
    if os.path.isdir(os.path.join(m.cache_mgr.cache_root, previous_revision)):
        shutil.rmtree(os.path.join(m.cache_mgr.cache_root, previous_revision))

    status_update_fn("Update complete.")
def hash_file_key_list(self, dataset, keys):
    """Helper to hash a list of manifest keys in the current revision, blocking until complete

    Args:
        dataset: Dataset object
        keys: List of manifest keys to hash

    Returns:
        list: Hash results in the same order as `keys`
    """
    m = Manifest(dataset, self.configuration.get('username'))

    # Schedule the async hasher on the event loop and block until all files are hashed
    loop = get_event_loop()
    hash_task = asyncio.ensure_future(m.hasher.hash(keys))
    loop.run_until_complete(asyncio.gather(hash_task))
    return hash_task.result()
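
# hash_file_key_list() wraps the manifest's async hasher so it can be called from synchronous
# backend code. The sketch below shows the same run-a-coroutine-to-completion pattern with a
# plain SHA-256 stand-in for the manifest hasher; the function names and the use of SHA-256
# are illustrative assumptions, not part of the backend API.
import asyncio
import hashlib


async def _hash_one(path: str) -> str:
    # Read the file in a worker thread so the event loop is not blocked, then hash the bytes
    loop = asyncio.get_running_loop()
    with open(path, 'rb') as f:
        data = await loop.run_in_executor(None, f.read)
    return hashlib.sha256(data).hexdigest()


async def hash_many(paths):
    # Hash all paths concurrently; gather() preserves input order in the results
    return await asyncio.gather(*(_hash_one(p) for p in paths))


def hash_paths_blocking(paths):
    """Run the async hashing from synchronous code and return the list of hex digests."""
    loop = asyncio.new_event_loop()
    try:
        return loop.run_until_complete(hash_many(paths))
    finally:
        loop.close()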