Example #1
0
    def update_metadata_and_save(self, bundle, enforce_disk_quota=False):
        """
        Updates the metadata about the contents of the bundle, including
        data_size as well as the total amount of disk used by the user.

        If |enforce_disk_quota| is True, raises a UsageError when the bundle's
        computed data size exceeds the owner's remaining disk quota.
        """
        bundle_path = self._bundle_store.get_bundle_location(bundle.uuid)

        # Normalize to a (directories, files) pair: a directory is listed
        # recursively, a single file becomes ([], [path]) so the hashing and
        # size helpers below get a uniform input shape.
        dirs_and_files = None
        if os.path.isdir(bundle_path):
            dirs_and_files = path_util.recursive_ls(bundle_path)
        else:
            dirs_and_files = [], [bundle_path]

        data_hash = '0x%s' % (path_util.hash_directory(bundle_path,
                                                       dirs_and_files))
        data_size = path_util.get_size(bundle_path, dirs_and_files)
        if enforce_disk_quota:
            disk_left = self._bundle_model.get_user_disk_quota_left(
                bundle.owner_id)
            if data_size > disk_left:
                raise UsageError(
                    "Can't save bundle, bundle size %s greater than user's disk quota left: %s"
                    % (data_size, disk_left))

        bundle_update = {
            'data_hash': data_hash,
            'metadata': {
                'data_size': data_size,
            },
        }
        self._bundle_model.update_bundle(bundle, bundle_update)
        # Keep the owner's total disk-used accounting in sync with the new size.
        self._bundle_model.update_user_disk_used(bundle.owner_id)
Example #2
0
    def update_metadata_and_save(self, bundle, new_bundle):
        """
        Updates the metadata about the contents of the bundle, including
        data_size as well as the total amount of disk used by the user.

        If |new_bundle| is True, saves the bundle as a new bundle. Otherwise,
        updates it.
        """
        bundle_path = self._bundle_store.get_bundle_location(bundle.uuid)

        # Present a single file as ([], [path]) so that the path_util helpers
        # receive the same (directories, files) shape in either case.
        if os.path.isdir(bundle_path):
            dirs_and_files = path_util.recursive_ls(bundle_path)
        else:
            dirs_and_files = ([], [bundle_path])

        data_hash = '0x%s' % (path_util.hash_directory(bundle_path, dirs_and_files))
        data_size = path_util.get_size(bundle_path, dirs_and_files)

        if new_bundle:
            # Brand-new bundle: set the fields on the object and save it whole.
            bundle.data_hash = data_hash
            bundle.metadata.set_metadata_key('data_size', data_size)
            self._bundle_model.save_bundle(bundle)
        else:
            # Existing bundle: issue a partial update instead.
            self._bundle_model.update_bundle(
                bundle,
                {
                    'data_hash': data_hash,
                    'metadata': {'data_size': data_size},
                },
            )

        self._bundle_model.update_user_disk_used(bundle.owner_id)
Example #3
0
    def update_metadata_and_save(self, bundle, new_bundle):
        """
        Updates the metadata about the contents of the bundle, including
        data_size as well as the total amount of disk used by the user.

        If |new_bundle| is True, saves the bundle as a new bundle. Otherwise,
        updates it.
        """
        bundle_path = self._bundle_store.get_bundle_location(bundle.uuid)

        # Normalize to (directories, files): a directory is listed recursively,
        # a single file becomes ([], [path]).
        dirs_and_files = None
        if os.path.isdir(bundle_path):
            dirs_and_files = path_util.recursive_ls(bundle_path)
        else:
            dirs_and_files = [], [bundle_path]

        data_hash = '0x%s' % (path_util.hash_directory(bundle_path, dirs_and_files))
        data_size = path_util.get_size(bundle_path, dirs_and_files)

        if new_bundle:
            # New bundle: set fields directly on the object and persist it.
            bundle.data_hash = data_hash
            bundle.metadata.set_metadata_key('data_size', data_size)
            self._bundle_model.save_bundle(bundle)
        else:
            # Existing bundle: apply a partial update to the stored record.
            bundle_update = {
               'data_hash': data_hash,
               'metadata': {
                    'data_size': data_size,
                },
            }
            self._bundle_model.update_bundle(bundle, bundle_update)

        # Keep the owner's total disk-used figure consistent with the new size.
        self._bundle_model.update_user_disk_used(bundle.owner_id)
    def test_hash_directory(self, mock_os):
        """
        Test the two-level hashing scheme, mocking out all filesystem operations.
        """
        dir_names = ['asdf', 'blah', 'this', 'is', 'not', 'sorted']
        file_names = ['foo', 'bar']
        rel_prefix = 'relative-'
        contents_prefix = 'contents-hash-'

        # Independently compute what the two-level scheme should produce:
        # one digest over sorted directory names, one over sorted file names
        # plus their (mocked) content hashes, then a digest of those digests.
        dir_digest = hashlib.sha1()
        for name in sorted(dir_names):
            dir_digest.update(
                hashlib.sha1((rel_prefix + name).encode()).hexdigest().encode())
        file_digest = hashlib.sha1()
        for name in sorted(file_names):
            file_digest.update(
                hashlib.sha1((rel_prefix + name).encode()).hexdigest().encode())
            file_digest.update((contents_prefix + name).encode())
        combined = hashlib.sha1()
        combined.update(dir_digest.hexdigest().encode())
        combined.update(file_digest.hexdigest().encode())
        expected_hash = combined.hexdigest()

        # Replace the filesystem operations that hash_directory relies on.
        def fake_recursive_ls(path):
            self.assertEqual(path, self.test_path)
            return (dir_names, file_names)

        def fake_get_relative_path(root, path):
            self.assertEqual(root, self.test_path)
            self.assertIn(path, dir_names + file_names)
            return rel_prefix + path

        def fake_hash_file_contents(path):
            self.assertIn(path, file_names)
            return contents_prefix + path

        with mock.patch('codalab.lib.path_util.recursive_ls',
                        fake_recursive_ls), \
             mock.patch('codalab.lib.path_util.get_relative_path',
                        fake_get_relative_path), \
             mock.patch('codalab.lib.path_util.hash_file_contents',
                        fake_hash_file_contents):
            actual_hash = path_util.hash_directory(self.test_path)
        self.assertEqual(actual_hash, expected_hash)
Example #5
0
    def test_hash_directory(self, mock_os):
        '''
    Test the two-level hashing scheme, mocking out all filesystem operations.
    '''
        # NOTE(review): this variant feeds str objects straight into hashlib,
        # which only works on Python 2 — the parallel Python 3 test encodes
        # each string first.
        tester = self
        directories = ['asdf', 'blah', 'this', 'is', 'not', 'sorted']
        files = ['foo', 'bar']
        relative_prefix = 'relative-'
        contents_hash_prefix = 'contents-hash-'

        # Compute the result of the two-level hashing scheme on this bundle:
        # one digest over sorted directory names, one over sorted file names
        # plus their (mocked) content hashes, then a digest of those digests.
        directory_hash = hashlib.sha1()
        for directory in sorted(directories):
            path_hash = hashlib.sha1(relative_prefix + directory).hexdigest()
            directory_hash.update(path_hash)
        file_hash = hashlib.sha1()
        for file_name in sorted(files):
            name_hash = hashlib.sha1(relative_prefix + file_name).hexdigest()
            file_hash.update(name_hash)
            file_hash.update(contents_hash_prefix + file_name)
        overall_hash = hashlib.sha1()
        overall_hash.update(directory_hash.hexdigest())
        overall_hash.update(file_hash.hexdigest())
        expected_hash = overall_hash.hexdigest()

        # Mock the recursive-listing and file-hashing operations in path_util.
        def mock_recursive_ls(path):
            tester.assertEqual(path, self.test_path)
            return (directories, files)

        def mock_get_relative_path(root, path):
            tester.assertEqual(root, self.test_path)
            tester.assertIn(path, directories + files)
            return relative_prefix + path

        def mock_hash_file_contents(path):
            tester.assertIn(path, files)
            return contents_hash_prefix + path

        with mock.patch('codalab.lib.path_util.recursive_ls', mock_recursive_ls):
            with mock.patch('codalab.lib.path_util.get_relative_path', mock_get_relative_path):
                with mock.patch(
                    'codalab.lib.path_util.hash_file_contents', mock_hash_file_contents
                ):
                    actual_hash = path_util.hash_directory(self.test_path)
        self.assertEqual(actual_hash, expected_hash)
Example #6
0
    def upload(self, path, allow_symlinks=False):
        '''
        Copy the contents of the directory at path into the data subdirectory,
        in a subfolder named by a hash of the contents of the new data directory.

        Return a (data_hash, metadata) pair, where the metadata is a dict mapping
        keys to precomputed statistics about the new data directory.

        NOTE(review): the visible body ends without a return statement, so the
        documented (data_hash, metadata) result presumably comes from code past
        this excerpt — confirm against the full file.
        '''
        absolute_path = path_util.normalize(path)
        path_util.check_isvalid(absolute_path, 'upload')
        # Recursively copy the directory into a new BundleStore temp directory.
        temp_directory = uuid.uuid4().hex
        temp_path = os.path.join(self.temp, temp_directory)
        path_util.copy(absolute_path, temp_path)
        # Multiplex between uploading a directory and uploading a file here.
        # All other path_util calls will use these lists of directories and files.
        if os.path.isdir(temp_path):
            dirs_and_files = path_util.recursive_ls(temp_path)
        else:
            dirs_and_files = ([], [temp_path])
        if not allow_symlinks:
            path_util.check_for_symlinks(temp_path, dirs_and_files)
        path_util.set_permissions(temp_path, 0o755, dirs_and_files)
        # Hash the contents of the temporary directory, and then if there is no
        # data with this hash value, move this directory into the data directory.
        data_hash = '0x%s' % (path_util.hash_directory(temp_path, dirs_and_files),)
        data_size = path_util.get_size(temp_path, dirs_and_files)
        final_path = os.path.join(self.data, data_hash)
        final_path_exists = False
        try:
            # os.utime doubles as an existence probe: it refreshes the mtime of
            # an already-stored identical bundle, or raises ENOENT if absent.
            # (Python 2 except syntax below.)
            os.utime(final_path, None)
            final_path_exists = True
        except OSError, e:
            if e.errno == errno.ENOENT:
                # Nothing at final_path yet: claim it by renaming the temp copy.
                os.rename(temp_path, final_path)
            else:
                raise
Example #7
0
    def upload(self, sources, follow_symlinks, exclude_patterns, git, unpack, remove_sources):
        '''
        |sources|: specifies the locations of the contents to upload.  Each element is either a URL or a local path.
        |follow_symlinks|: for local path(s), whether to follow (resolve) symlinks
        |exclude_patterns|: for local path(s), don't upload these patterns (e.g., *.o)
        |git|: for URL, whether |source| is a git repo to clone.
        |unpack|: for each source in |sources|, whether to unpack it if it's an archive.
        |remove_sources|: remove |sources|.

        If |sources| contains one source, then the bundle contents will be that source.
        Otherwise, the bundle contents will be a directory with each of the sources.
        Exceptions:
        - If |git|, then each source is replaced with the result of running 'git clone |source|'
        - If |unpack| is True or a source is an archive (zip, tar.gz, etc.), then unpack the source.

        Install the contents of the directory at |source| into
        DATA_SUBDIRECTORY in a subdirectory named by a hash of the contents.

        Return a (data_hash, metadata) pair, where the metadata is a dict mapping
        keys to precomputed statistics about the new data directory.
        '''
        # Paths scheduled for removal once the upload has been finalized.
        to_delete = []

        # Create temporary directory as a staging area and put everything there.
        temp_path = tempfile.mkdtemp('-bundle_store_upload')
        temp_subpaths = []
        for source in sources:
            # Where to save |source| to (might change this value if we unpack).
            temp_subpath = os.path.join(temp_path, os.path.basename(source))
            if remove_sources:
                to_delete.append(source)
            # Only unpack when requested AND the source looks like an archive.
            source_unpack = unpack and zip_util.path_is_archive(source)

            if path_util.path_is_url(source):
                # Download the URL.
                print_util.open_line('BundleStore.upload: downloading %s to %s' % (source, temp_path))
                if git:
                    file_util.git_clone(source, temp_subpath)
                else:
                    file_util.download_url(source, temp_subpath, print_status=True)
                    if source_unpack:
                        # Replace the downloaded archive with its extracted contents.
                        zip_util.unpack(temp_subpath, zip_util.strip_archive_ext(temp_subpath))
                        path_util.remove(temp_subpath)
                        temp_subpath = zip_util.strip_archive_ext(temp_subpath)
                print_util.clear_line()
            else:
                # Copy the local path.
                source_path = path_util.normalize(source)
                path_util.check_isvalid(source_path, 'upload')

                # Recursively copy the directory into a new BundleStore temp directory.
                print_util.open_line('BundleStore.upload: %s => %s' % (source_path, temp_subpath))
                if source_unpack:
                    zip_util.unpack(source_path, zip_util.strip_archive_ext(temp_subpath))
                    temp_subpath = zip_util.strip_archive_ext(temp_subpath)
                else:
                    if remove_sources:
                        # Moving is cheaper than copy+delete when we own the source.
                        path_util.rename(source_path, temp_subpath)
                    else:
                        path_util.copy(source_path, temp_subpath, follow_symlinks=follow_symlinks, exclude_patterns=exclude_patterns)
                print_util.clear_line()

            temp_subpaths.append(temp_subpath)

        # If exactly one source, then upload that directly.
        if len(temp_subpaths) == 1:
            to_delete.append(temp_path)
            temp_path = temp_subpaths[0]

        # Multiplex between uploading a directory and uploading a file here.
        # All other path_util calls will use these lists of directories and files.
        if os.path.isdir(temp_path):
            dirs_and_files = path_util.recursive_ls(temp_path)
        else:
            dirs_and_files = ([], [temp_path])

        # Hash the contents of the temporary directory, and then if there is no
        # data with this hash value, move this directory into the data directory.
        print_util.open_line('BundleStore.upload: hashing %s' % temp_path)
        data_hash = '0x%s' % (path_util.hash_directory(temp_path, dirs_and_files),)
        print_util.clear_line()
        print_util.open_line('BundleStore.upload: computing size of %s' % temp_path)
        data_size = path_util.get_size(temp_path, dirs_and_files)
        print_util.clear_line()
        final_path = os.path.join(self.data, data_hash)
        if os.path.exists(final_path):
            # Already exists, just delete it (content-addressed storage:
            # identical contents are already stored under this hash).
            path_util.remove(temp_path)
        else:
            print >>sys.stderr, 'BundleStore.upload: moving %s to %s' % (temp_path, final_path)
            path_util.rename(temp_path, final_path)

        # Delete paths.
        for path in to_delete:
            if os.path.exists(path):
                path_util.remove(path)

        # After this operation there should always be a directory at the final path.
        assert(os.path.lexists(final_path)), 'Uploaded to %s failed!' % (final_path,)
        return (data_hash, {'data_size': data_size})
Example #8
0
    def health_check(self,
                     model,
                     force=False,
                     compute_data_hash=False,
                     repair_hashes=False):
        """
        MultiDiskBundleStore.health_check(): In the MultiDiskBundleStore, bundle contents are stored on disk, and
        occasionally the disk gets out of sync with the database, in which case we make repairs in the following ways:

            1. Deletes bundles with corresponding UUID not in the database.
            2. Deletes any files not beginning with UUID string.
            3. For each bundle marked READY or FAILED, ensure that its dependencies are not located in the bundle
               directory. If they are then delete the dependencies.
            4. For bundle <UUID> marked READY or FAILED, <UUID>.cid or <UUID>.status, or the <UUID>(-internal).sh files
               should not exist.
        |force|: Perform any destructive operations on the bundle store the health check determines are necessary. False by default
        |compute_data_hash|: If True, compute the data_hash for every single bundle ourselves and see if it's consistent with what's in
                             the database. False by default.
        |repair_hashes|: If True (together with |force|), overwrite mismatching data_hash values in the database.
        """
        UUID_REGEX = re.compile(r'^(%s)' % spec_util.UUID_STR)

        def _delete_path(loc):
            # Always print the equivalent shell command for the (dry-run) log;
            # only actually remove the path when |force| is set.
            cmd = 'rm -r \'%s\'' % loc
            print(cmd)
            if force:
                path_util.remove(loc)

        def _get_uuid(path):
            """Returns the leading UUID of the path's basename, or None if absent."""
            fname = os.path.basename(path)
            try:
                return UUID_REGEX.match(fname).groups()[0]
            except AttributeError:  # re.match returned None: no UUID prefix
                return None

        def _is_bundle(path):
            """Returns whether the given path is a bundle directory/file"""
            return _get_uuid(path) == os.path.basename(path)

        def _check_bundle_paths(bundle_paths, db_bundle_by_uuid):
            """
            Takes in a list of bundle paths and a mapping of UUID to BundleModel, and returns a list of paths and
            subpaths that need to be removed.
            """
            to_delete = []
            for bundle_path in bundle_paths:
                uuid = _get_uuid(bundle_path)
                # Screen for bundles stored on disk that are no longer in the database
                bundle = db_bundle_by_uuid.get(uuid, None)
                if bundle is None:
                    to_delete += [bundle_path]
                    continue
                # Delete dependencies stored inside of READY or FAILED bundles
                if bundle.state in [State.READY, State.FAILED]:
                    dep_paths = [
                        os.path.join(bundle_path, dep.child_path)
                        for dep in bundle.dependencies
                    ]
                    to_delete += list(filter(os.path.exists, dep_paths))
            return to_delete

        def _check_other_paths(other_paths, db_bundle_by_uuid):
            """
            Given a list of non-bundle paths, and a mapping of UUID to BundleModel, returns a list of paths to delete.
            """
            to_delete = []
            for path in other_paths:
                uuid = _get_uuid(path)
                bundle = db_bundle_by_uuid.get(uuid, None)
                if bundle is None:
                    to_delete += [path]
                    continue
                # Leftover runtime artifacts (<uuid>.cid/.status/.sh) should
                # not survive once the bundle has reached a terminal state.
                ends_with_ext = (path.endswith('.cid')
                                 or path.endswith('.status')
                                 or path.endswith('.sh'))
                if bundle.state in [State.READY, State.FAILED]:
                    if ends_with_ext:
                        to_delete += [path]
                        continue
                    elif '.' in path:
                        print('WARNING: File %s is likely junk.' % path,
                              file=sys.stderr)
            return to_delete

        partitions, _ = path_util.ls(self.partitions)
        trash_count = 0
        # BUGFIX: initialize once, before the partition loop. Previously this
        # counter lived inside the loop, so the summary below reported only the
        # last partition's count (and raised NameError with zero partitions).
        data_hash_recomputed = 0

        for partition in partitions:
            print('Looking for trash in partition %s...' % partition,
                  file=sys.stderr)
            partition_path = os.path.join(
                self.partitions, partition,
                MultiDiskBundleStore.DATA_SUBDIRECTORY)
            # Flatten the (dirs, files) pair from ls into absolute paths.
            entries = list(
                map(
                    lambda f: os.path.join(partition_path, f),
                    reduce(lambda d, f: d + f, path_util.ls(partition_path)),
                ))
            bundle_paths = list(filter(_is_bundle, entries))
            other_paths = set(entries) - set(bundle_paths)

            uuids = list(map(_get_uuid, bundle_paths))
            db_bundles = model.batch_get_bundles(uuid=uuids)
            db_bundle_by_uuid = dict()
            for bundle in db_bundles:
                db_bundle_by_uuid[bundle.uuid] = bundle

            # Check both bundles and non-bundles and remove each
            for to_delete in _check_bundle_paths(bundle_paths,
                                                 db_bundle_by_uuid):
                trash_count += 1
                _delete_path(to_delete)
            for to_delete in _check_other_paths(other_paths,
                                                db_bundle_by_uuid):
                trash_count += 1
                _delete_path(to_delete)

            # Check for each bundle if we need to compute its data_hash
            print('Checking data_hash of bundles in partition %s...' %
                  partition,
                  file=sys.stderr)
            for bundle_path in bundle_paths:
                uuid = _get_uuid(bundle_path)
                bundle = db_bundle_by_uuid.get(uuid, None)
                if bundle is None:
                    continue
                if compute_data_hash or bundle.data_hash is None:
                    dirs_and_files = (path_util.recursive_ls(bundle_path)
                                      if os.path.isdir(bundle_path) else
                                      ([], [bundle_path]))
                    data_hash = '0x%s' % path_util.hash_directory(
                        bundle_path, dirs_and_files)
                    if bundle.data_hash is None:
                        # Missing hash: fill it in (write only under |force|).
                        data_hash_recomputed += 1
                        print(
                            'Giving bundle %s data_hash %s' %
                            (bundle_path, data_hash),
                            file=sys.stderr,
                        )
                        if force:
                            db_update = dict(data_hash=data_hash)
                            model.update_bundle(bundle, db_update)
                    elif compute_data_hash and data_hash != bundle.data_hash:
                        # Inconsistent hash: report, and repair only when both
                        # |repair_hashes| and |force| are set.
                        data_hash_recomputed += 1
                        print(
                            'Bundle %s should have data_hash %s, actual digest is %s'
                            % (bundle_path, bundle.data_hash, data_hash),
                            file=sys.stderr,
                        )
                        if repair_hashes and force:
                            db_update = dict(data_hash=data_hash)
                            model.update_bundle(bundle, db_update)

        if force:
            print('\tDeleted %d objects from the bundle store' % trash_count,
                  file=sys.stderr)
            print('\tRecomputed data_hash for %d bundles' %
                  data_hash_recomputed,
                  file=sys.stderr)
        else:
            print(
                'Dry-Run Statistics, re-run with --force to perform updates:',
                file=sys.stderr)
            print('\tObjects marked for deletion: %d' % trash_count,
                  file=sys.stderr)
            print(
                '\tBundles that need data_hash recompute: %d' %
                data_hash_recomputed,
                file=sys.stderr,
            )
Example #9
0
    def health_check(self, model, force=False, compute_data_hash=False, repair_hashes=False):
        """
        MultiDiskBundleStore.health_check(): In the MultiDiskBundleStore, bundle contents are stored on disk, and
        occasionally the disk gets out of sync with the database, in which case we make repairs in the following ways:

            1. Deletes bundles with corresponding UUID not in the database.
            2. Deletes any files not beginning with UUID string.
            3. For each bundle marked READY or FAILED, ensure that its dependencies are not located in the bundle
               directory. If they are then delete the dependencies.
            4. For bundle <UUID> marked READY or FAILED, <UUID>.cid or <UUID>.status, or the <UUID>(-internal).sh files
               should not exist.
        |force|: Perform any destructive operations on the bundle store the health check determines are necessary. False by default
        |compute_data_hash|: If True, compute the data_hash for every single bundle ourselves and see if it's consistent with what's in
                             the database. False by default.
        |repair_hashes|: If True (together with |force|), overwrite mismatching data_hash values in the database.
        """
        UUID_REGEX = re.compile(r'^(%s)' % spec_util.UUID_STR)

        def _delete_path(loc):
            # Always print the equivalent shell command for the (dry-run) log;
            # only actually remove the path when |force| is set.
            cmd = 'rm -r \'%s\'' % loc
            print(cmd)
            if force:
                path_util.remove(loc)

        def _get_uuid(path):
            """Returns the leading UUID of the path's basename, or None if absent."""
            fname = os.path.basename(path)
            try:
                return UUID_REGEX.match(fname).groups()[0]
            except AttributeError:  # re.match returned None: no UUID prefix
                return None

        def _is_bundle(path):
            """Returns whether the given path is a bundle directory/file"""
            return _get_uuid(path) == os.path.basename(path)

        def _check_bundle_paths(bundle_paths, db_bundle_by_uuid):
            """
            Takes in a list of bundle paths and a mapping of UUID to BundleModel, and returns a list of paths and
            subpaths that need to be removed.
            """
            to_delete = []
            for bundle_path in bundle_paths:
                uuid = _get_uuid(bundle_path)
                # Screen for bundles stored on disk that are no longer in the database
                bundle = db_bundle_by_uuid.get(uuid, None)
                if bundle is None:
                    to_delete += [bundle_path]
                    continue
                # Delete dependencies stored inside of READY or FAILED bundles
                if bundle.state in [State.READY, State.FAILED]:
                    dep_paths = [
                        os.path.join(bundle_path, dep.child_path) for dep in bundle.dependencies
                    ]
                    to_delete += filter(os.path.exists, dep_paths)
            return to_delete

        def _check_other_paths(other_paths, db_bundle_by_uuid):
            """
            Given a list of non-bundle paths, and a mapping of UUID to BundleModel, returns a list of paths to delete.
            """
            to_delete = []
            for path in other_paths:
                uuid = _get_uuid(path)
                bundle = db_bundle_by_uuid.get(uuid, None)
                if bundle is None:
                    to_delete += [path]
                    continue
                # Leftover runtime artifacts (<uuid>.cid/.status/.sh) should
                # not survive once the bundle has reached a terminal state.
                ends_with_ext = (
                    path.endswith('.cid') or path.endswith('.status') or path.endswith('.sh')
                )
                if bundle.state in [State.READY, State.FAILED]:
                    if ends_with_ext:
                        to_delete += [path]
                        continue
                    elif '.' in path:
                        print >>sys.stderr, 'WARNING: File %s is likely junk.' % path
            return to_delete

        partitions, _ = path_util.ls(self.partitions)
        trash_count = 0
        # BUGFIX: initialize once, before the partition loop. Previously this
        # counter lived inside the loop, so the summary below reported only the
        # last partition's count (and raised NameError with zero partitions).
        data_hash_recomputed = 0

        for partition in partitions:
            print >>sys.stderr, 'Looking for trash in partition %s...' % partition
            partition_path = os.path.join(
                self.partitions, partition, MultiDiskBundleStore.DATA_SUBDIRECTORY
            )
            # Flatten the (dirs, files) pair from ls into absolute paths.
            entries = map(
                lambda f: os.path.join(partition_path, f),
                reduce(lambda d, f: d + f, path_util.ls(partition_path)),
            )
            bundle_paths = filter(_is_bundle, entries)
            other_paths = set(entries) - set(bundle_paths)

            uuids = map(_get_uuid, bundle_paths)
            db_bundles = model.batch_get_bundles(uuid=uuids)
            db_bundle_by_uuid = dict()
            for bundle in db_bundles:
                db_bundle_by_uuid[bundle.uuid] = bundle

            # Check both bundles and non-bundles and remove each
            for to_delete in _check_bundle_paths(bundle_paths, db_bundle_by_uuid):
                trash_count += 1
                _delete_path(to_delete)
            for to_delete in _check_other_paths(other_paths, db_bundle_by_uuid):
                trash_count += 1
                _delete_path(to_delete)

            # Check for each bundle if we need to compute its data_hash
            print >>sys.stderr, 'Checking data_hash of bundles in partition %s...' % partition
            for bundle_path in bundle_paths:
                uuid = _get_uuid(bundle_path)
                bundle = db_bundle_by_uuid.get(uuid, None)
                if bundle is None:
                    continue
                if compute_data_hash or bundle.data_hash is None:
                    dirs_and_files = (
                        path_util.recursive_ls(bundle_path)
                        if os.path.isdir(bundle_path)
                        else ([], [bundle_path])
                    )
                    data_hash = '0x%s' % path_util.hash_directory(bundle_path, dirs_and_files)
                    if bundle.data_hash is None:
                        # Missing hash: fill it in (write only under |force|).
                        data_hash_recomputed += 1
                        print >>sys.stderr, 'Giving bundle %s data_hash %s' % (
                            bundle_path,
                            data_hash,
                        )
                        if force:
                            db_update = dict(data_hash=data_hash)
                            model.update_bundle(bundle, db_update)
                    elif compute_data_hash and data_hash != bundle.data_hash:
                        # Inconsistent hash: report, and repair only when both
                        # |repair_hashes| and |force| are set.
                        data_hash_recomputed += 1
                        print >>sys.stderr, 'Bundle %s should have data_hash %s, actual digest is %s' % (
                            bundle_path,
                            bundle.data_hash,
                            data_hash,
                        )
                        if repair_hashes and force:
                            db_update = dict(data_hash=data_hash)
                            model.update_bundle(bundle, db_update)

        if force:
            print >>sys.stderr, '\tDeleted %d objects from the bundle store' % trash_count
            print >>sys.stderr, '\tRecomputed data_hash for %d bundles' % data_hash_recomputed
        else:
            print >>sys.stderr, 'Dry-Run Statistics, re-run with --force to perform updates:'
            print >>sys.stderr, '\tObjects marked for deletion: %d' % trash_count
            print >>sys.stderr, '\tBundles that need data_hash recompute: %d' % data_hash_recomputed
Example #10
0
    def upload(self, sources, follow_symlinks, exclude_patterns, git, unpack, remove_sources, uuid):
        """
        Install the given |sources| into the bundle store under a path derived
        from |uuid|, and return precomputed statistics about the result.

        |sources|: locations of the contents to upload; each element is either a URL or a local path.
        |follow_symlinks|: for local path(s), whether to follow (resolve) symlinks.
        |exclude_patterns|: for local path(s), don't upload these patterns (e.g., *.o).
        |git|: for URL, whether |source| is a git repo to clone.
        |unpack|: for each source in |sources|, whether to unpack it if it's an archive.
        |remove_sources|: remove |sources| after a successful upload.

        A single source becomes the bundle contents directly; multiple sources
        are laid out as entries of a directory at the final path.
        - If |git|, each URL source is the result of 'git clone <source>'.
        - If |unpack| and a source is an archive (zip, tar.gz, etc.), the
          unpacked contents are stored instead of the archive itself.

        Return a (data_hash, metadata) pair, where the metadata is a dict mapping
        keys to precomputed statistics about the new data directory.

        Raises UsageError if the target path already exists in the store.
        """
        pending_removals = []

        # A lone source is stored directly at the final path rather than
        # inside a containing directory.
        is_single = len(sources) == 1

        # Pick the partition for this bundle via the hash ring keyed on uuid.
        target_disk = self.ring.get_node(uuid)
        final_path = os.path.join(self.partitions, target_disk, self.DATA_SUBDIRECTORY, uuid)

        if os.path.exists(final_path):
            raise UsageError('Path %s already present in bundle store' % final_path)
        if not is_single:
            # Multiple sources need a fresh containing directory.
            path_util.make_directory(final_path)

        # Final locations of each uploaded source.
        stored_paths = []

        for source in sources:
            # Destination for this source; may be adjusted below when unpacking.
            if is_single:
                subpath = final_path
            else:
                subpath = os.path.join(final_path, os.path.basename(source))

            if remove_sources:
                pending_removals.append(source)

            should_unpack = unpack and zip_util.path_is_archive(source)
            if should_unpack and is_single:
                # Keep the archive extension for now so stripping it later
                # yields the right final name.
                subpath += zip_util.get_archive_ext(source)

            if path_util.path_is_url(source):
                # Fetch remote content: git clone, or plain download
                # (optionally unpacked, with the archive removed afterwards).
                print_util.open_line('BundleStore.upload: downloading %s to %s' % (source, subpath))
                if git:
                    file_util.git_clone(source, subpath)
                else:
                    file_util.download_url(source, subpath, print_status=True)
                    if should_unpack:
                        zip_util.unpack(subpath, zip_util.strip_archive_ext(subpath))
                        path_util.remove(subpath)
                        subpath = zip_util.strip_archive_ext(subpath)
                print_util.clear_line()
            else:
                # Local source: validate it, then unpack, move, or copy it in.
                source_path = path_util.normalize(source)
                path_util.check_isvalid(source_path, 'upload')

                print_util.open_line('BundleStore.upload: %s => %s' % (source_path, subpath))
                if should_unpack:
                    zip_util.unpack(source_path, zip_util.strip_archive_ext(subpath))
                    subpath = zip_util.strip_archive_ext(subpath)
                elif remove_sources:
                    # Source is being consumed anyway, so a rename is cheaper
                    # than a copy.
                    path_util.rename(source_path, subpath)
                else:
                    path_util.copy(source_path, subpath, follow_symlinks=follow_symlinks, exclude_patterns=exclude_patterns)
                print_util.clear_line()

            stored_paths.append(subpath)

        # Build the (dirs, files) listing used by the hashing/size helpers:
        # full recursive listing for a directory, a singleton file otherwise.
        if os.path.isdir(final_path):
            dirs_and_files = path_util.recursive_ls(final_path)
        else:
            dirs_and_files = [], [final_path]

        # Compute the content hash and total size for the bundle metadata.
        print_util.open_line('BundleStore.upload: hashing %s' % final_path)
        data_hash = '0x%s' % (path_util.hash_directory(final_path, dirs_and_files))
        print_util.clear_line()
        print_util.open_line('BundleStore.upload: computing size of %s' % final_path)
        data_size = path_util.get_size(final_path, dirs_and_files)
        print_util.clear_line()

        # The upload succeeded, so it is now safe to delete consumed sources.
        for stale in pending_removals:
            if os.path.exists(stale):
                path_util.remove(stale)

        # After this operation there should always be something at the final path.
        assert (os.path.lexists(final_path)), 'Uploaded to %s failed!' % (final_path,)
        return (data_hash, {'data_size': data_size})
Example #11
0
    def upload(self, path, follow_symlinks, exclude_patterns):
        '''
        Copy the contents of the directory at |path| into the data subdirectory,
        in a subfolder named by a hash of the contents of the new data directory.
        If |path| is in a temporary directory, then we just move it.

        |path| may be a single local path, a list of local paths (copied
        together), or a URL ('file://' URLs are only allowed when they fall
        under one of self.direct_upload_paths).

        Return a (data_hash, metadata) pair, where the metadata is a dict mapping
        keys to precomputed statistics about the new data directory.

        Raises UsageError for disallowed 'file://' uploads (symlinks, or paths
        outside self.direct_upload_paths).
        '''
        # Create temporary directory as a staging area.
        # If |path| is already temporary, then we use that directly
        # (with the understanding that |path| will be moved)
        if not isinstance(path, list) and os.path.realpath(path).startswith(
                os.path.realpath(self.temp)):
            temp_path = path
        else:
            temp_path = os.path.join(self.temp, uuid.uuid4().hex)

        if not isinstance(path, list) and path_util.path_is_url(path):
            # Have to be careful.  Want to make sure if we're fetching a URL
            # that points to a file, we are allowing this.
            if path.startswith('file://'):
                # Strip the 7-character 'file://' scheme prefix.
                path_suffix = path[7:]
                # A symlink could escape the whitelisted directories, so
                # refuse it outright.
                if os.path.islink(path_suffix):
                    raise UsageError('Not allowed to upload symlink %s' %
                                     path_suffix)
                # Only prefixes listed in direct_upload_paths may be read
                # directly from the server's filesystem.
                if not any(
                        path_suffix.startswith(f)
                        for f in self.direct_upload_paths):
                    raise UsageError(
                        'Not allowed to upload %s (only %s allowed)' %
                        (path_suffix, self.direct_upload_paths))

            # Download |path| if it is a URL.
            print >> sys.stderr, 'BundleStore.upload: downloading %s to %s' % (
                path, temp_path)
            file_util.download_url(path, temp_path, print_status=True)
        elif path != temp_path:
            # Copy |path| into the temp_path.
            # Normalize and validate each local source before copying.
            if isinstance(path, list):
                absolute_path = [path_util.normalize(p) for p in path]
                for p in absolute_path:
                    path_util.check_isvalid(p, 'upload')
            else:
                absolute_path = path_util.normalize(path)
                path_util.check_isvalid(absolute_path, 'upload')

            # Recursively copy the directory into a new BundleStore temp directory.
            print_util.open_line('BundleStore.upload: copying %s to %s' %
                                 (absolute_path, temp_path))
            path_util.copy(absolute_path,
                           temp_path,
                           follow_symlinks=follow_symlinks,
                           exclude_patterns=exclude_patterns)
            print_util.clear_line()

        # Multiplex between uploading a directory and uploading a file here.
        # All other path_util calls will use these lists of directories and files.
        if os.path.isdir(temp_path):
            dirs_and_files = path_util.recursive_ls(temp_path)
        else:
            dirs_and_files = ([], [temp_path])

        # Hash the contents of the temporary directory, and then if there is no
        # data with this hash value, move this directory into the data directory.
        print_util.open_line('BundleStore.upload: hashing %s' % temp_path)
        data_hash = '0x%s' % (path_util.hash_directory(temp_path,
                                                       dirs_and_files), )
        print_util.clear_line()
        print_util.open_line('BundleStore.upload: computing size of %s' %
                             temp_path)
        data_size = path_util.get_size(temp_path, dirs_and_files)
        print_util.clear_line()
        # Content-addressed final location: identical data de-duplicates.
        final_path = os.path.join(self.data, data_hash)
        final_path_exists = False
        try:
            # If data_hash already exists, then we don't need to move it over.
            # EAFP: touching the path both probes existence and refreshes its
            # mtime for any LRU-style cleanup.
            os.utime(final_path, None)
            final_path_exists = True
        except OSError, e:
            if e.errno == errno.ENOENT:
                # First upload of this content: move the staged data in place.
                print >> sys.stderr, 'BundleStore.upload: moving %s to %s' % (
                    temp_path, final_path)
                path_util.rename(temp_path, final_path)
            else:
                # Any other filesystem error is unexpected; propagate it.
                raise
Example #12
0
    def upload(self, path, follow_symlinks):
        """
        Copy the contents of the directory at |path| into the data subdirectory,
        in a subfolder named by a hash of the contents of the new data directory.
        If |path| is in a temporary directory, then we just move it.

        |path| may be a single local path, a list of local paths (copied
        together), or a URL ("file://" URLs are only allowed when they fall
        under one of self.direct_upload_paths).

        Return a (data_hash, metadata) pair, where the metadata is a dict mapping
        keys to precomputed statistics about the new data directory.

        Raises UsageError for disallowed "file://" uploads (symlinks, or paths
        outside self.direct_upload_paths).
        """
        # Create temporary directory as a staging area.
        # If |path| is already temporary, then we use that directly
        # (with the understanding that |path| will be moved)
        if not isinstance(path, list) and os.path.realpath(path).startswith(os.path.realpath(self.temp)):
            temp_path = path
        else:
            temp_path = os.path.join(self.temp, uuid.uuid4().hex)

        if not isinstance(path, list) and path_util.path_is_url(path):
            # Have to be careful.  Want to make sure if we're fetching a URL
            # that points to a file, we are allowing this.
            if path.startswith("file://"):
                # Strip the 7-character "file://" scheme prefix.
                path_suffix = path[7:]
                # A symlink could escape the whitelisted directories, so
                # refuse it outright.
                if os.path.islink(path_suffix):
                    raise UsageError("Not allowed to upload symlink %s" % path_suffix)
                # Only prefixes listed in direct_upload_paths may be read
                # directly from the server's filesystem.
                if not any(path_suffix.startswith(f) for f in self.direct_upload_paths):
                    raise UsageError(
                        "Not allowed to upload %s (only %s allowed)" % (path_suffix, self.direct_upload_paths)
                    )

            # Download |path| if it is a URL.
            print >>sys.stderr, "BundleStore.upload: downloading %s to %s" % (path, temp_path)
            file_util.download_url(path, temp_path, print_status=True)
        elif path != temp_path:
            # Copy |path| into the temp_path.
            # Normalize and validate each local source before copying.
            if isinstance(path, list):
                absolute_path = [path_util.normalize(p) for p in path]
                for p in absolute_path:
                    path_util.check_isvalid(p, "upload")
            else:
                absolute_path = path_util.normalize(path)
                path_util.check_isvalid(absolute_path, "upload")

            # Recursively copy the directory into a new BundleStore temp directory.
            print >>sys.stderr, "BundleStore.upload: copying %s to %s" % (absolute_path, temp_path)
            path_util.copy(absolute_path, temp_path, follow_symlinks=follow_symlinks)

        # Multiplex between uploading a directory and uploading a file here.
        # All other path_util calls will use these lists of directories and files.
        if os.path.isdir(temp_path):
            dirs_and_files = path_util.recursive_ls(temp_path)
        else:
            dirs_and_files = ([], [temp_path])

        # Hash the contents of the temporary directory, and then if there is no
        # data with this hash value, move this directory into the data directory.
        print >>sys.stderr, "BundleStore.upload: hashing %s" % (temp_path)
        data_hash = "0x%s" % (path_util.hash_directory(temp_path, dirs_and_files),)
        data_size = path_util.get_size(temp_path, dirs_and_files)
        # Content-addressed final location: identical data de-duplicates.
        final_path = os.path.join(self.data, data_hash)
        final_path_exists = False
        try:
            # If data_hash already exists, then we don't need to move it over.
            # EAFP: touching the path both probes existence and refreshes its
            # mtime for any LRU-style cleanup.
            os.utime(final_path, None)
            final_path_exists = True
        except OSError, e:
            if e.errno == errno.ENOENT:
                # First upload of this content: move the staged data in place.
                print >>sys.stderr, "BundleStore.upload: moving %s to %s" % (temp_path, final_path)
                path_util.rename(temp_path, final_path)
            else:
                # Any other filesystem error is unexpected; propagate it.
                raise