Code example #1
  def testChunkGenerator(self):
    nums = list(range(0, 2501))

    # Verify default chunk size.
    new_nums = []
    for i, some_nums in enumerate(utils.chunk_generator(nums)):
      new_nums += some_nums
    # Should have processed in 3 chunks (0, 1, 2):
    self.assertEqual(2, i)
    self.assertListEqual(new_nums, nums)

    # Verify chunk size bigger than input.
    new_nums = []
    for i, some_nums in enumerate(utils.chunk_generator(nums, chunk_size=5000)):
      new_nums += some_nums
    # Should have processed in 1 chunk:
    self.assertEqual(0, i)
    self.assertListEqual(new_nums, nums)
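
The utils.chunk_generator under test is not shown on this page. Below is a minimal sketch of what it presumably looks like; the default chunk_size of 1000 is inferred from the test above (2501 items arriving in three chunks), and the real Titan implementation may differ:

def chunk_generator(iterable, chunk_size=1000):
  """Yield successive lists of at most chunk_size items from iterable.

  Sketch only: default chunk_size inferred from the test in example #1.
  """
  chunk = []
  for item in iterable:
    chunk.append(item)
    if len(chunk) == chunk_size:
      yield chunk
      chunk = []
  if chunk:
    # Yield the final, possibly short, chunk.
    yield chunk
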
Code example #2
    def testChunkGenerator(self):
        nums = list(range(0, 2501))

        # Verify default chunk size.
        new_nums = []
        for i, some_nums in enumerate(utils.chunk_generator(nums)):
            new_nums += some_nums
        # Should have processed in 3 chunks (0, 1, 2):
        self.assertEqual(2, i)
        self.assertListEqual(new_nums, nums)

        # Verify chunk size bigger than input.
        new_nums = []
        for i, some_nums in enumerate(
                utils.chunk_generator(nums, chunk_size=5000)):
            new_nums += some_nums
        # Should have processed in 1 chunk:
        self.assertEqual(0, i)
        self.assertListEqual(new_nums, nums)
Code example #3
File: dirs.py  Project: dindinet/titan
    def process_next_window(self):
        """Lease one window-worth of tasks and update the corresponding dirs.

        Returns:
          A list of ModifiedPaths.
        """
        queue = taskqueue.Queue(TASKQUEUE_NAME)
        # Don't specify a tag; this pulls the oldest tasks of the same tag.
        tasks = queue.lease_tasks_by_tag(lease_seconds=TASKQUEUE_LEASE_SECONDS,
                                         max_tasks=TASKQUEUE_LEASE_MAX_TASKS)
        if not tasks:
            return []

        # Keep leasing similar tasks if we hit the per-request leasing max.
        have_all_tasks = len(tasks) < TASKQUEUE_LEASE_MAX_TASKS
        while not have_all_tasks:
            tasks_in_window = queue.lease_tasks_by_tag(
                lease_seconds=TASKQUEUE_LEASE_SECONDS,
                max_tasks=TASKQUEUE_LEASE_MAX_TASKS,
                tag=tasks[0].tag)
            tasks.extend(tasks_in_window)
            if len(tasks_in_window) < TASKQUEUE_LEASE_MAX_TASKS:
                have_all_tasks = True

        # Package each task's data into a ModifiedPath and pass it on.
        # Don't deal with ordering or chronologically collapsing paths here.
        modified_paths = []
        for task in tasks:
            path_data = json.loads(task.payload)
            modified_path = ModifiedPath(
                path=path_data['path'],
                namespace=path_data['namespace'],
                modified=path_data['modified'],
                action=path_data['action'],
            )
            modified_paths.append(modified_path)

        # Compute the affected directories and then update them if needed.
        dir_service = DirService()
        affected_dirs = dir_service.compute_affected_dirs(modified_paths)
        dir_service.update_affected_dirs(**affected_dirs)

        for tasks_to_delete in utils.chunk_generator(tasks):
            queue.delete_tasks(tasks_to_delete)

        return modified_paths
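
ModifiedPath is constructed here but not defined on this page. A minimal sketch, assuming it is a plain value holder for the four payload fields; the WRITE/DELETE constant names come from the comparisons in example #5, but their values here are hypothetical:

class ModifiedPath(object):
    """Sketch: one file modification pulled off the pull queue."""

    # Hypothetical values; example #5 only compares against the names
    # ModifiedPath.WRITE and ModifiedPath.DELETE.
    WRITE = 1
    DELETE = 2

    def __init__(self, path, namespace, modified, action):
        self.path = path            # Affected file path, e.g. '/foo/bar.txt'.
        self.namespace = namespace  # Namespace the path belongs to.
        self.modified = modified    # Modification timestamp, used for ordering.
        self.action = action        # WRITE or DELETE.
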
Code example #4
File: dirs.py  Project: GoogleCloudPlatform/titan
  def process_next_window(self):
    """Lease one window-worth of tasks and update the corresponding dirs.

    Returns:
      A list of ModifiedPaths.
    """
    queue = taskqueue.Queue(TASKQUEUE_NAME)
    # Don't specify a tag; this pulls the oldest tasks of the same tag.
    tasks = queue.lease_tasks_by_tag(lease_seconds=TASKQUEUE_LEASE_SECONDS,
                                     max_tasks=TASKQUEUE_LEASE_MAX_TASKS)
    if not tasks:
      return []

    # Keep leasing similar tasks if we hit the per-request leasing max.
    have_all_tasks = len(tasks) < TASKQUEUE_LEASE_MAX_TASKS
    while not have_all_tasks:
      tasks_in_window = queue.lease_tasks_by_tag(
          lease_seconds=TASKQUEUE_LEASE_SECONDS,
          max_tasks=TASKQUEUE_LEASE_MAX_TASKS,
          tag=tasks[0].tag)
      tasks.extend(tasks_in_window)
      if len(tasks_in_window) < TASKQUEUE_LEASE_MAX_TASKS:
        have_all_tasks = True

    # Package each task's data into a ModifiedPath and pass it on.
    # Don't deal with ordering or chronologically collapsing paths here.
    modified_paths = []
    for task in tasks:
      path_data = json.loads(task.payload)
      modified_path = ModifiedPath(
          path=path_data['path'],
          namespace=path_data['namespace'],
          modified=path_data['modified'],
          action=path_data['action'],
      )
      modified_paths.append(modified_path)

    # Compute the affected directories and then update them if needed.
    dir_service = DirService()
    affected_dirs = dir_service.compute_affected_dirs(modified_paths)
    dir_service.update_affected_dirs(**affected_dirs)

    for tasks_to_delete in utils.chunk_generator(tasks):
      queue.delete_tasks(tasks_to_delete)

    return modified_paths
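
Note that both forks delete the leased tasks in utils.chunk_generator batches rather than in a single call. The pull-queue API caps how many tasks one request may lease or delete (the lease calls above respect the same ceiling via TASKQUEUE_LEASE_MAX_TASKS), so chunking presumably keeps each queue.delete_tasks call under that per-request limit even when the tag-draining loop has accumulated thousands of tasks.
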
Code example #5
File: dirs.py  Project: dindinet/titan
class DirService(object):
    """Service for managing directory entities."""
    def compute_affected_dirs(self, modified_paths):
        """Compute which dirs are affected by path modifications.

        Args:
          modified_paths: A list of ModifiedPath objects.
        Raises:
          NamespaceMismatchError: If mixing namespaces.
        Returns:
          A dictionary containing 'dirs_with_adds' and 'dirs_with_deletes',
          both of which are sets of strings containing the affected dir paths.
        """
        # Fall back to None (the default namespace) if there is nothing to do.
        namespace = modified_paths[0].namespace if modified_paths else None
        # First, merge file path modifications.
        # Perform an in-order pass to get the final modified state of each file.
        sorted_paths = sorted(modified_paths, key=lambda path: path.modified)
        new_modified_paths = {}
        for modified_path in sorted_paths:
            if modified_path.namespace != namespace:
                raise NamespaceMismatchError(
                    'Namespace "{}" does not match namespace "{}".'.format(
                        modified_path.namespace, namespace))
            new_modified_paths[modified_path.path] = modified_path
        sorted_paths = sorted(new_modified_paths.values(),
                              key=lambda path: path.modified)

        # Second, generate the set of affected directory paths.
        # This does not need to collapse dirs which are added and then deleted,
        # the dir should be present in both lists if it is affected by both an
        # add and a delete.
        dirs_with_adds = set()
        dirs_with_deletes = set()
        for modified_path in sorted_paths:
            current_dirs = utils.split_path(modified_path.path)
            if modified_path.action == ModifiedPath.WRITE:
                dirs_with_adds = dirs_with_adds.union(set(current_dirs))
            elif modified_path.action == ModifiedPath.DELETE:
                dirs_with_deletes = dirs_with_deletes.union(set(current_dirs))

        # Ignore root dir; it's hard-coded elsewhere to always exist.
        dirs_with_adds.discard('/')
        dirs_with_deletes.discard('/')

        affected_dirs = {
            'namespace': namespace,
            'dirs_with_adds': dirs_with_adds,
            'dirs_with_deletes': dirs_with_deletes,
        }
        return affected_dirs

    @ndb.toplevel
    def update_affected_dirs(self,
                             dirs_with_adds,
                             dirs_with_deletes,
                             namespace=None,
                             async=False):
        """Manage changes to _TitanDir entities computed by compute_affected_dirs."""
        # Order deletes depth-first. This isn't strictly by depth, but all we
        # need to guarantee is that paths with common subdirs are deleted
        # deepest-first, which reverse alphabetical order accomplishes.
        dirs_with_deletes = sorted(dirs_with_deletes, reverse=True)

        # For every directory which contained a deleted file (including children),
        # check if the directory should disappear. It should disappear if:
        #   1. There are no files in the directory, and...
        #   2. There are no child directories, and...
        #   3. The directory path is not present in dirs_with_adds.
        dirs_paths_to_delete = []
        for path in dirs_with_deletes:
            if path in dirs_with_adds or files.Files.list(
                    path, namespace=namespace, limit=1, _internal=True):
                # The directory is marked for addition, or files still exist in it.
                continue
            subdirs = Dirs.list(path, limit=2)
            if len(subdirs) > 1:
                # Multiple subdirs exist, cannot delete dir.
                continue
            elif len(subdirs) == 1:
                # Handle the case where the only remaining subdir is marked for delete.
                if subdirs.values()[0].path not in dirs_paths_to_delete:
                    continue
            dirs_paths_to_delete.append(path)

        # Batch get all directory entities, both added and deleted.
        ns = namespace
        dir_keys = [
            ndb.Key(_TitanDir, path, namespace=ns)
            for path in dirs_paths_to_delete
        ]
        dir_keys += [
            ndb.Key(_TitanDir, path, namespace=ns) for path in dirs_with_adds
        ]
        existing_dir_ents = ndb.get_multi(dir_keys)
        # Transform into a dictionary mapping paths to existing entities:
        existing_dirs = {}
        for ent in existing_dir_ents:
            if ent:
                existing_dirs[ent.path] = ent

        changed_dir_ents = []
        for path in dirs_paths_to_delete:
            if path in existing_dirs:
                # Existing directory, mark as deleted.
                ent = existing_dirs[path]
                if ent.status == _STATUS_DELETED:
                    # Skip this entity entirely if it's already correct.
                    continue
                ent.status = _STATUS_DELETED
            else:
                # Missing directory entity, create a new one and mark as deleted.
                ent = _TitanDir(
                    # NDB properties:
                    id=path,
                    namespace=namespace,
                    # Model properties:
                    name=os.path.basename(path),
                    parent_path=os.path.dirname(path),
                    parent_paths=utils.split_path(path),
                    status=_STATUS_DELETED,
                )
            changed_dir_ents.append(ent)

        for path in dirs_with_adds:
            if path in existing_dirs:
                # Existing directory, make sure it's marked as available.
                ent = existing_dirs[path]
                if ent.status == _STATUS_AVAILABLE:
                    # Skip this entity entirely if it's already correct.
                    continue
                ent.status = _STATUS_AVAILABLE
            else:
                # Missing directory entity, create a new one and mark as available.
                ent = _TitanDir(
                    # NDB properties:
                    id=path,
                    namespace=namespace,
                    # Model properties:
                    name=os.path.basename(path),
                    parent_path=os.path.dirname(path),
                    parent_paths=utils.split_path(path),
                    status=_STATUS_AVAILABLE,
                )
            changed_dir_ents.append(ent)

        for dir_ents in utils.chunk_generator(changed_dir_ents,
                                              chunk_size=100):
            if async:
                ndb.put_multi_async(dir_ents)
            else:
                ndb.put_multi(dir_ents)
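
To tie example #5 together, here is a hypothetical driver, assuming the ModifiedPath sketch above and that utils.split_path returns a path's ancestor directories (including '/'); the paths and timestamps are made up for illustration:

paths = [
    ModifiedPath(path='/a/b/new.txt', namespace=None, modified=1.0,
                 action=ModifiedPath.WRITE),
    ModifiedPath(path='/a/c/old.txt', namespace=None, modified=2.0,
                 action=ModifiedPath.DELETE),
]
dir_service = DirService()
affected_dirs = dir_service.compute_affected_dirs(paths)
# Expected result (root '/' is discarded):
#   affected_dirs['dirs_with_adds']    == set(['/a', '/a/b'])
#   affected_dirs['dirs_with_deletes'] == set(['/a', '/a/c'])
dir_service.update_affected_dirs(**affected_dirs)

Because update_affected_dirs is decorated with @ndb.toplevel, any async puts issued inside it are waited on before the method returns, so callers see a consistent view either way.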