Example #1
    def pull_objects(self, dataset: Dataset, objects: List[PullObject],
                     progress_update_fn: Callable) -> PullResult:
        """High-level method to simply link files from the source dir to the object directory to the revision directory

        Args:
            dataset: The current dataset
            objects: A list of PullObjects that enumerate the objects to pull
            progress_update_fn: A callable with arg "completed_bytes" (int) indicating how many bytes have been
                                downloaded since the last call

        Returns:
            PullResult
        """
        # Link from local data directory to the object directory
        for obj in objects:
            if os.path.exists(obj.object_path):
                # Re-link to make 100% sure all links are consistent if a link already exists
                os.remove(obj.object_path)
            os.link(os.path.join(self._get_local_data_dir(), obj.dataset_path),
                    obj.object_path)
            progress_update_fn(os.path.getsize(obj.object_path))

        # link from object dir through to revision dir
        m = Manifest(dataset, self.configuration.get('username'))
        m.link_revision()

        return PullResult(success=objects,
                          failure=[],
                          message="Linked data directory. All files from the manifest should be available")
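A minimal sketch of how a caller might drive pull_objects(); the `ds` and `m` objects and the keys below are placeholders borrowed from the test fixtures further down, and the callback contract is just the "completed_bytes" argument described in the docstring:

    # Hypothetical driver; `ds`, `m`, and the keys are assumptions for
    # illustration (they mirror the test fixtures below).
    def print_progress(completed_bytes: int) -> None:
        # Called once per linked file with its size in bytes
        print(f"transferred {completed_bytes} bytes")

    objects = [PullObject(object_path=m.dataset_to_object_path(key),
                          revision=m.dataset_revision,
                          dataset_path=key)
               for key in ['test1.txt', 'subdir/test3.txt']]
    result = ds.backend.pull_objects(ds, objects, print_progress)
    assert result.success and not result.failure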
Example #2
    def test_update_from_local(self, mock_dataset_with_local_dir):
        ds = mock_dataset_with_local_dir[0]

        assert ds.backend.can_update_from_remote() is True

        m = Manifest(ds, 'tester')
        assert len(m.manifest.keys()) == 0

        ds.backend.update_from_remote(ds, updater)

        m = Manifest(ds, 'tester')
        assert len(m.manifest.keys()) == 4
        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'test1.txt'))
        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'test2.txt'))
        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'subdir',
                         'test3.txt'))

        modified_items = ds.backend.verify_contents(ds, updater)
        assert len(modified_items) == 0

        test_dir = os.path.join(mock_dataset_with_local_dir[1], "local_data",
                                "test_dir")
        with open(os.path.join(test_dir, 'test1.txt'), 'wt') as tf:
            tf.write("This file got changed in the filesystem")

        modified_items = ds.backend.verify_contents(ds, updater)
        assert len(modified_items) == 1
        assert 'test1.txt' in modified_items

        ds.backend.update_from_local(ds, updater, verify_contents=True)
        assert len(m.manifest.keys()) == 4
        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'test1.txt'))
        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'test2.txt'))
        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'subdir',
                         'test3.txt'))

        modified_items = ds.backend.verify_contents(ds, updater)
        assert len(modified_items) == 0

        with open(
                os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                             'test1.txt'), 'rt') as tf:
            assert tf.read() == "This file got changed in the filesystem"
Example #3
    def test_update_from_remote(self, mock_config_class, mock_public_bucket):
        im = mock_config_class[0]
        ds = im.create_dataset(USERNAME,
                               USERNAME,
                               'dataset-1',
                               description="my dataset 1",
                               storage_type="public_s3_bucket")
        ds.backend.set_default_configuration(USERNAME, 'fakebearertoken',
                                             'fakeidtoken')

        assert ds.backend.can_update_from_remote() is True

        m = Manifest(ds, USERNAME)
        assert len(m.manifest.keys()) == 0

        # Configure backend completely
        current_config = ds.backend_config
        current_config['Bucket Name'] = mock_public_bucket
        current_config['Prefix'] = ""
        ds.backend_config = current_config

        # Trigger update
        ds.backend.update_from_remote(ds, updater)

        m = Manifest(ds, USERNAME)
        assert len(m.manifest.keys()) == 7
        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'test-file-1.bin'))
        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'test-file-2.bin'))
        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'metadata/test-file-3.bin'))
        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'metadata/test-file-4.bin'))
        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'metadata/sub/test-file-5.bin'))

        with open(
                os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                             'test-file-1.bin'), 'rt') as tf:
            data = tf.read()
            assert data[0:4] == 'asdf'

        with open(
                os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                             'metadata/test-file-4.bin'), 'rt') as tf:
            data = tf.read()
            assert data[0:4] == '1234'
Example #4
    def pull_objects(self, dataset: Dataset, objects: List[PullObject],
                     progress_update_fn: Callable) -> PullResult:
        """High-level method to simply link files from the source dir to the object directory to the revision directory

        Args:
            dataset: The current dataset
            objects: A list of PullObjects the enumerate objects to push
            progress_update_fn: A callable with arg "completed_bytes" (int) indicating how many bytes have been
                                downloaded in since last called

        Returns:
            PullResult
        """
        client = self._get_client()
        bucket, prefix = self._get_s3_config()

        backend_config = dataset.client_config.config['datasets']['backends'][dataset.backend.storage_type]
        chunk_size = backend_config['download_chunk_size']
        success = list()
        failure = list()
        message = f"Downloaded {len(objects)} objects successfully."

        for obj in objects:
            # Get object
            response = client.get_object(Bucket=bucket,
                                         Key=os.path.join(prefix, obj.dataset_path))

            if response['ResponseMetadata']['HTTPStatusCode'] == 200:
                # Save file
                with open(obj.object_path, 'wb') as out_file:
                    for chunk in response['Body'].iter_chunks(chunk_size=chunk_size):
                        out_file.write(chunk)
                        progress_update_fn(len(chunk))

                success.append(obj)
            else:
                failure.append(obj)

        if len(failure) > 0:
            message = f"Downloaded {len(success)} objects successfully, but {len(failure)} failed. Check results."

        # link from object dir through to revision dir
        m = Manifest(dataset, self.configuration.get('username'))
        m.link_revision()

        return PullResult(success=success,
                          failure=failure,
                          message=message)
Example #5
    def test_update_from_remote_backend_change(self, mock_config_class,
                                               mock_public_bucket):
        im = mock_config_class[0]
        ds = im.create_dataset(USERNAME,
                               USERNAME,
                               'dataset-1',
                               description="my dataset 1",
                               storage_type="public_s3_bucket")
        ds.backend.set_default_configuration(USERNAME, 'fakebearertoken',
                                             'fakeidtoken')

        assert ds.backend.can_update_from_remote() is True

        m = Manifest(ds, USERNAME)
        assert len(m.manifest.keys()) == 0

        # Configure backend completely
        current_config = ds.backend_config
        current_config['Bucket Name'] = mock_public_bucket
        current_config['Prefix'] = ""
        ds.backend_config = current_config

        # Trigger update
        ds.backend.update_from_remote(ds, updater)

        m = Manifest(ds, USERNAME)
        assert len(m.manifest.keys()) == 7

        modified_items = ds.backend.verify_contents(ds, updater)
        assert len(modified_items) == 0

        with tempfile.NamedTemporaryFile('wt') as tf:
            conn = boto3.resource('s3', region_name='us-east-1')
            tf.write("This file has been updated!")
            tf.seek(0)
            conn.meta.client.upload_file(tf.name, mock_public_bucket,
                                         'test-file-1.bin')

        ds.backend.update_from_remote(ds, updater)
        assert len(m.manifest.keys()) == 7

        modified_items = ds.backend.verify_contents(ds, updater)
        assert len(modified_items) == 0

        with open(
                os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                             'test-file-1.bin'), 'rt') as tf:
            assert tf.read() == "This file has been updated!"
Example #6
    def test_update_from_remote(self, mock_dataset_with_local_dir):
        ds = mock_dataset_with_local_dir[0]

        assert ds.backend.can_update_from_remote() is True

        m = Manifest(ds, 'tester')
        assert len(m.manifest.keys()) == 0

        ds.backend.update_from_remote(ds, updater)

        m = Manifest(ds, 'tester')
        assert len(m.manifest.keys()) == 4
        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'test1.txt'))
        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'test2.txt'))
        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'subdir',
                         'test3.txt'))
Example #7
    def verify_contents(self, dataset, status_update_fn: Callable) -> List[str]:
        """Method to verify the hashes of all local files and indicate if they have changed

        Args:
            dataset: Dataset object
            status_update_fn: A callable, accepting a string for logging/providing status to the UI

        Returns:
            list
        """
        if 'username' not in self.configuration:
            raise ValueError("Dataset storage backend requires current logged in username to verify contents")

        m = Manifest(dataset, self.configuration.get('username'))
        keys_to_verify = list()
        for item in m.manifest:
            if os.path.isfile(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, item)):
                # File exists locally
                keys_to_verify.append(item)

        # re-hash files
        status_update_fn(f"Validating contents of {len(keys_to_verify)} files. Please wait.")
        updated_hashes = self.hash_file_key_list(dataset, keys_to_verify)

        modified_items = list()
        for key, new_hash in zip(keys_to_verify, updated_hashes):
            item = m.manifest.get(key)
            if item and new_hash != item.get('h'):
                modified_items.append(key)

        if modified_items:
            status_update_fn(f"Integrity check complete. {len(modified_items)} files have been modified.")
        else:
            status_update_fn(f"Integrity check complete. No files have been modified.")

        return modified_items
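A short usage sketch for verify_contents(), assuming a configured `ds` like the ones in the tests; the status callback just receives human-readable strings:

    # Hypothetical usage; `ds` is a configured dataset as in the tests above.
    def log_status(msg: str) -> None:
        print(msg)  # forward status strings to a logger or the UI

    modified = ds.backend.verify_contents(ds, log_status)
    for key in modified:
        print(f"local file out of sync with manifest: {key}")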
Example #8
    def update_from_remote(self, dataset, status_update_fn: Callable) -> None:
        """Optional method that updates the dataset by comparing against the remote. Not all unmanaged dataset backends
        will be able to do this.

        Args:
            dataset: Dataset object
            status_update_fn: A callable, accepting a string for logging/providing status to the UI

        Returns:
            None
        """
        if 'username' not in self.configuration:
            raise ValueError("Dataset storage backend requires current logged in username to verify contents")
        m = Manifest(dataset, self.configuration.get('username'))

        # Walk remote checking etags with cached versions
        etag_data = self._load_etag_data(dataset)

        bucket, prefix = self._get_s3_config()
        client = self._get_client()

        paginator = client.get_paginator('list_objects_v2')
        response_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix)

        all_files = list()
        added_files = list()
        modified_files = list()
        print_cnt = 0

        revision_dir = os.path.join(m.cache_mgr.cache_root, m.dataset_revision)
        for x in response_iterator:
            if print_cnt == 0:
                status_update_fn("Processing Bucket Contents, please wait.")
                print_cnt += 1
            elif print_cnt == 1:
                status_update_fn("Processing Bucket Contents, please wait..")
                print_cnt += 1
            else:
                status_update_fn("Processing Bucket Contents, please wait...")
                print_cnt = 0

            for item in x.get("Contents"):
                key = item['Key']
                all_files.append(key)
                if key in m.manifest:
                    # Object already tracked
                    if etag_data.get(key) != item['ETag']:
                        # Object has been modified since last update
                        etag_data[key] = item['ETag']
                        modified_files.append(key)
                        if os.path.exists(os.path.join(revision_dir, key)):
                            # Delete current version
                            os.remove(os.path.join(revision_dir, key))

                        if key[-1] == "/":
                            # is a "directory"
                            os.makedirs(os.path.join(revision_dir, key), exist_ok=True)
                        else:
                            client.download_file(bucket, key, os.path.join(revision_dir, key))
                else:
                    # New Object
                    etag_data[key] = item['ETag']
                    added_files.append(key)

                    if key[-1] == "/":
                        # is a "directory
                        os.makedirs(os.path.join(revision_dir, key), exist_ok=True)
                    else:
                        os.makedirs(os.path.dirname(os.path.join(revision_dir, key)), exist_ok=True)
                        client.download_file(bucket, key, os.path.join(revision_dir, key))

        deleted_files = sorted(list(set(m.manifest.keys()).difference(all_files)))

        # Create StatusResult to force modifications
        status = StatusResult(created=added_files, modified=modified_files, deleted=deleted_files)

        self._save_etag_data(dataset, etag_data)

        # Run local update
        self.update_from_local(dataset, status_update_fn, status_result=status)
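_load_etag_data() and _save_etag_data() are referenced above but not shown. A minimal sketch of what they might look like, assuming the cache is a JSON dict (key -> ETag) stored somewhere under the dataset; the file name and location here are assumptions, not the project's actual layout:

    import json

    def _load_etag_data(self, dataset) -> dict:
        # Assumed location; only the dict-of-ETags contract is implied by
        # update_from_remote() above.
        etag_file = os.path.join(dataset.root_dir, '.etag_cache.json')
        if os.path.exists(etag_file):
            with open(etag_file, 'rt') as f:
                return json.load(f)
        return dict()

    def _save_etag_data(self, dataset, etag_data: dict) -> None:
        etag_file = os.path.join(dataset.root_dir, '.etag_cache.json')  # assumed path
        with open(etag_file, 'wt') as f:
            json.dump(etag_data, f)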
Example #9
    def update_from_remote(self, dataset, status_update_fn: Callable) -> None:
        """Optional method that updates the dataset by comparing against the remote. Not all unmanaged dataset backends
        will be able to do this.

        Args:
            dataset: Dataset object
            status_update_fn: A callable, accepting a string for logging/providing status to the UI

        Returns:
            None
        """
        if 'username' not in self.configuration:
            raise ValueError(
                "Dataset storage backend requires current logged in username to verify contents"
            )
        m = Manifest(dataset, self.configuration.get('username'))

        # walk the local source dir, looking for additions/deletions
        all_files = list()
        added_files = list()
        local_data_dir = self._get_local_data_dir()

        os.makedirs(os.path.join(m.cache_mgr.cache_root, m.dataset_revision),
                    exist_ok=True)

        for root, dirs, files in os.walk(local_data_dir):
            # Path of this directory relative to the local data dir
            folder = root[len(local_data_dir):]
            if folder.startswith(os.path.sep):
                folder = folder[1:]

            for d in dirs:
                # TODO: Check for ignored
                rel_path = os.path.join(
                    folder, d
                ) + os.path.sep  # All folders are represented with a trailing slash
                all_files.append(rel_path)
                if rel_path not in m.manifest:
                    added_files.append(rel_path)
                    # Create dir in current revision for linking to work
                    os.makedirs(os.path.join(m.cache_mgr.cache_root,
                                             m.dataset_revision, rel_path),
                                exist_ok=True)

            for file in files:
                # TODO: Check for ignored
                if file in ['.smarthash', '.DS_STORE', '.DS_Store']:
                    continue

                rel_path = os.path.join(folder, file)
                all_files.append(rel_path)
                if rel_path not in m.manifest:
                    added_files.append(rel_path)
                    # Hard-link into current revision for downstream linking to work
                    if not os.path.exists(
                            os.path.join(m.cache_mgr.cache_root,
                                         m.dataset_revision, rel_path)):
                        os.link(
                            os.path.join(root, file),
                            os.path.join(m.cache_mgr.cache_root,
                                         m.dataset_revision, rel_path))

                    # TODO: Determine whether manifest linking is needed here (the manifest is likely not populated yet)

        deleted_files = sorted(
            list(set(m.manifest.keys()).difference(all_files)))

        # Create StatusResult to force modifications
        status = StatusResult(created=added_files,
                              modified=[],
                              deleted=deleted_files)

        # Link the revision dir
        m.link_revision()

        # Run local update
        self.update_from_local(dataset,
                               status_update_fn,
                               status_result=status,
                               verify_contents=True)
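The walk above stores directories with a trailing separator, which is why a fixture with three files and one subdirectory yields four manifest keys in the tests:

    # Sketch of the manifest keys produced for the local-dir fixture, assuming
    # the layout used in the tests (trailing slash marks a directory entry).
    expected_keys = {'test1.txt', 'test2.txt', 'subdir/', 'subdir/test3.txt'}
    # len(expected_keys) == 4, matching the `len(m.manifest.keys()) == 4` asserts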
Example #10
    def test_pull(self, mock_dataset_with_local_dir):
        def chunk_update_callback(completed_bytes: int):
            """Method to update the job's metadata and provide feedback to the UI"""
            assert type(completed_bytes) == int
            assert completed_bytes > 0

        ds = mock_dataset_with_local_dir[0]
        m = Manifest(ds, 'tester')
        assert len(m.manifest.keys()) == 0
        ds.backend.update_from_remote(ds, updater)
        m = Manifest(ds, 'tester')

        # Remove revision dir
        shutil.rmtree(os.path.join(m.cache_mgr.cache_root, m.dataset_revision))

        keys = ['test1.txt', 'test2.txt', 'subdir/test3.txt']
        pull_objects = list()
        for key in keys:
            pull_objects.append(
                PullObject(object_path=m.dataset_to_object_path(key),
                           revision=m.dataset_revision,
                           dataset_path=key))
            # Remove objects
            os.remove(m.dataset_to_object_path(key))

        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'test1.txt')) is False
        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'test2.txt')) is False
        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'subdir',
                         'test3.txt')) is False

        for key in keys:
            assert os.path.isfile(m.dataset_to_object_path(key)) is False

        # Pull 1 File
        ds.backend.pull_objects(ds, [pull_objects[0]], chunk_update_callback)
        assert os.path.isdir(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision))
        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'test1.txt')) is True
        assert os.path.isfile(m.dataset_to_object_path('test1.txt')) is True

        # Pull all Files
        ds.backend.pull_objects(ds, pull_objects, chunk_update_callback)
        assert os.path.isdir(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision))
        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'test1.txt')) is True
        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'test2.txt')) is True
        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'subdir',
                         'test3.txt')) is True
        for key in keys:
            assert os.path.isfile(m.dataset_to_object_path(key)) is True
Example #11
    def test_pull(self, mock_config_class, mock_public_bucket):
        im = mock_config_class[0]
        ds = im.create_dataset(USERNAME,
                               USERNAME,
                               'dataset-1',
                               description="my dataset 1",
                               storage_type="public_s3_bucket")
        ds.backend.set_default_configuration(USERNAME, 'fakebearertoken',
                                             'fakeidtoken')

        # Configure backend completely
        current_config = ds.backend_config
        current_config['Bucket Name'] = mock_public_bucket
        current_config['Prefix'] = ""
        ds.backend_config = current_config

        ds.backend.update_from_remote(ds, updater)
        m = Manifest(ds, 'tester')

        # Remove revision dir and objects from cache
        shutil.rmtree(os.path.join(m.cache_mgr.cache_root, m.dataset_revision))

        keys = [
            'test-file-1.bin', 'metadata/test-file-3.bin',
            'metadata/sub/test-file-5.bin'
        ]
        pull_objects = list()
        for key in keys:
            pull_objects.append(
                PullObject(object_path=m.dataset_to_object_path(key),
                           revision=m.dataset_revision,
                           dataset_path=key))
            # Remove objects
            os.remove(m.dataset_to_object_path(key))

        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'test-file-1.bin')) is False
        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'metadata', 'test-file-3.bin')) is False
        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'metadata', 'sub', 'test-file-5.bin')) is False

        for key in keys:
            assert os.path.isfile(m.dataset_to_object_path(key)) is False

        # Pull 1 file (test-file-2.bin has duplicate contents, so both files appear)
        ds.backend.pull_objects(ds, [pull_objects[0]], chunk_update_callback)
        assert os.path.isdir(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision))
        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'test-file-1.bin')) is True
        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'test-file-2.bin')) is True
        assert os.path.isfile(
            m.dataset_to_object_path('test-file-1.bin')) is True
        assert os.path.isfile(
            m.dataset_to_object_path('test-file-2.bin')) is True

        # Pull all Files
        ds.backend.pull_objects(ds, pull_objects, chunk_update_callback)
        assert os.path.isdir(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision))
        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'test-file-1.bin')) is True
        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'test-file-2.bin')) is True
        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'metadata', 'test-file-3.bin')) is True
        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'metadata', 'test-file-4.bin')) is True
        assert os.path.isfile(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'metadata', 'sub', 'test-file-5.bin')) is True
        for key in keys:
            assert os.path.isfile(m.dataset_to_object_path(key)) is True
Example #12
    def update_from_local(self, dataset, status_update_fn: Callable,
                          verify_contents: bool = False,
                          status_result: Optional[StatusResult] = None) -> None:
        """Method to update the dataset manifest for changed files that exists locally

        Args:
            dataset: Dataset object
            status_update_fn: A callable, accepting a string for logging/providing status to the UI
            verify_contents: Boolean indicating if "verify_contents" should be run, and the results added to modified
            status_result: Optional StatusResult object to include in the update (typically from update_from_remote())

        Returns:
            None
        """
        if 'username' not in self.configuration:
            raise ValueError("Dataset storage backend requires current logged in username to verify contents")
        m = Manifest(dataset, self.configuration.get('username'))

        status_update_fn("Updating Dataset manifest from local file state.")

        if status_result is not None and status_result.modified is not None:
            modified_keys = copy.deepcopy(status_result.modified)
        else:
            modified_keys = list()

        if verify_contents:
            modified_keys.extend(self.verify_contents(dataset, status_update_fn))

        # Create StatusResult to force modifications
        if status_result:
            created_result = copy.deepcopy(status_result.created)
            # Check if any directories got created
            for key in status_result.created:
                if key[-1] != '/':
                    # a file
                    if os.path.dirname(key) not in m.manifest:
                        # Add the directory to the manifest
                        created_result.append(f"{os.path.dirname(key)}/")

            created_result = list(set(created_result))
            if '/' in created_result:
                created_result.remove('/')

            # Combine a previous StatusResult object (typically from "update_from_remote")
            status = StatusResult(created=created_result,
                                  modified=modified_keys,
                                  deleted=status_result.deleted)
        else:
            status = StatusResult(created=[], modified=modified_keys, deleted=[])

        # Update the manifest
        previous_revision = m.dataset_revision

        m.update(status)
        m.create_update_activity_record(status)

        # Link the revision dir
        m.link_revision()
        if os.path.isdir(os.path.join(m.cache_mgr.cache_root, previous_revision)):
            shutil.rmtree(os.path.join(m.cache_mgr.cache_root, previous_revision))

        status_update_fn("Update complete.")
Example #13
    def hash_file_key_list(self, dataset, keys):
        """Hash each dataset key in `keys`, returning digests in the same order"""
        m = Manifest(dataset, self.configuration.get('username'))
        # Drive the async hasher to completion synchronously
        loop = get_event_loop()
        hash_task = asyncio.ensure_future(m.hasher.hash(keys))
        loop.run_until_complete(asyncio.gather(hash_task))
        return hash_task.result()
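Because the coroutine is driven with run_until_complete(), the call blocks until every hash is done; callers can zip the input keys with the results, as verify_contents() does above. A hypothetical caller (`ds` and its backend are assumptions, as before):

    keys = ['test1.txt', 'subdir/test3.txt']
    hashes = ds.backend.hash_file_key_list(ds, keys)  # synchronous from here
    for key, digest in zip(keys, hashes):
        print(key, digest)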