def testChunkGenerator(self):
  nums = range(0, 2501)

  # Verify default chunk size.
  new_nums = []
  for i, some_nums in enumerate(utils.chunk_generator(nums)):
    new_nums += some_nums
  # Should have processed in 3 chunks (i = 0, 1, 2):
  self.assertEqual(2, i)
  self.assertListEqual(new_nums, nums)

  # Verify chunk size bigger than input.
  new_nums = []
  for i, some_nums in enumerate(utils.chunk_generator(nums, chunk_size=5000)):
    new_nums += some_nums
  # Should have processed in 1 chunk:
  self.assertEqual(0, i)
  self.assertListEqual(new_nums, nums)
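# For reference, a minimal sketch of what utils.chunk_generator could look
# like. This is an assumption inferred from the test above, not the actual
# implementation; the default chunk_size of 1000 is inferred from the
# three-chunk expectation for 2501 items.
def chunk_generator(iterable, chunk_size=1000):
  """Yields successive chunk_size-sized lists from iterable."""
  chunk = []
  for item in iterable:
    chunk.append(item)
    if len(chunk) == chunk_size:
      yield chunk
      chunk = []
  if chunk:
    # Yield the final, partially-filled chunk.
    yield chunk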
def process_next_window(self):
  """Lease one window-worth of tasks and update the corresponding dirs.

  Returns:
    A list of ModifiedPaths.
  """
  queue = taskqueue.Queue(TASKQUEUE_NAME)

  # Don't specify a tag; this pulls the oldest tasks of the same tag.
  tasks = queue.lease_tasks_by_tag(lease_seconds=TASKQUEUE_LEASE_SECONDS,
                                   max_tasks=TASKQUEUE_LEASE_MAX_TASKS)
  if not tasks:
    return []

  # Keep leasing similar tasks if we hit the per-request leasing max.
  have_all_tasks = len(tasks) < TASKQUEUE_LEASE_MAX_TASKS
  while not have_all_tasks:
    tasks_in_window = queue.lease_tasks_by_tag(
        lease_seconds=TASKQUEUE_LEASE_SECONDS,
        max_tasks=TASKQUEUE_LEASE_MAX_TASKS,
        tag=tasks[0].tag)
    tasks.extend(tasks_in_window)
    if len(tasks_in_window) < TASKQUEUE_LEASE_MAX_TASKS:
      have_all_tasks = True

  # Package each task's data into a ModifiedPath and pass it on.
  # Don't deal with ordering or chronologically collapsing paths here.
  modified_paths = []
  for task in tasks:
    path_data = json.loads(task.payload)
    modified_path = ModifiedPath(
        path=path_data['path'],
        namespace=path_data['namespace'],
        modified=path_data['modified'],
        action=path_data['action'],
    )
    modified_paths.append(modified_path)

  # Compute the affected directories and then update them if needed.
  dir_service = DirService()
  affected_dirs = dir_service.compute_affected_dirs(modified_paths)
  dir_service.update_affected_dirs(**affected_dirs)

  # Delete the leased tasks in batches.
  for tasks_to_delete in utils.chunk_generator(tasks):
    queue.delete_tasks(tasks_to_delete)

  return modified_paths
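# A hedged sketch of the ModifiedPath value object assumed by
# process_next_window above and DirService below. Only the attributes and
# action constants actually referenced are included; the real class and its
# constant values may differ.
class ModifiedPath(object):

  WRITE = 'write'    # Assumed constant value.
  DELETE = 'delete'  # Assumed constant value.

  def __init__(self, path, namespace, modified, action):
    self.path = path            # Absolute file path, e.g. '/dir/file.txt'.
    self.namespace = namespace  # Datastore namespace, or None for default.
    self.modified = modified    # Modification timestamp; used for ordering.
    self.action = action        # ModifiedPath.WRITE or ModifiedPath.DELETE.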
class DirService(object):
  """Service for managing directory entities."""

  def compute_affected_dirs(self, modified_paths):
    """Compute which dirs are affected by path modifications.

    Args:
      modified_paths: A list of ModifiedPath objects.
    Raises:
      NamespaceMismatchError: If mixing namespaces.
    Returns:
      A dictionary containing 'namespace', 'dirs_with_adds', and
      'dirs_with_deletes'; the latter two are sets of strings containing
      the affected dir paths.
    """
    namespace = modified_paths[0].namespace if modified_paths else None

    # First, merge file path modifications.
    # Perform an in-order pass to get the final modified state of each file.
    sorted_paths = sorted(modified_paths, key=lambda path: path.modified)
    new_modified_paths = {}
    for modified_path in sorted_paths:
      if modified_path.namespace != namespace:
        raise NamespaceMismatchError(
            'Namespace "{}" does not match namespace "{}".'.format(
                modified_path.namespace, namespace))
      new_modified_paths[modified_path.path] = modified_path
    sorted_paths = sorted(new_modified_paths.values(),
                          key=lambda path: path.modified)

    # Second, generate the set of affected directory paths.
    # This does not need to collapse dirs which are added and then deleted;
    # the dir should be present in both lists if it is affected by both an
    # add and a delete.
    dirs_with_adds = set()
    dirs_with_deletes = set()
    for modified_path in sorted_paths:
      current_dirs = utils.split_path(modified_path.path)
      if modified_path.action == ModifiedPath.WRITE:
        dirs_with_adds = dirs_with_adds.union(set(current_dirs))
      elif modified_path.action == ModifiedPath.DELETE:
        dirs_with_deletes = dirs_with_deletes.union(set(current_dirs))

    # Ignore root dir; it's hard-coded elsewhere to always exist.
    dirs_with_adds.discard('/')
    dirs_with_deletes.discard('/')

    affected_dirs = {
        'namespace': namespace,
        'dirs_with_adds': dirs_with_adds,
        'dirs_with_deletes': dirs_with_deletes,
    }
    return affected_dirs

  @ndb.toplevel
  def update_affected_dirs(self, dirs_with_adds, dirs_with_deletes,
                           namespace=None, async=False):
    """Manage changes to _TitanDir entities computed by compute_affected_dirs."""
    # Order deletes by depth first. This isn't actually by depth, but all we
    # need to guarantee here is that paths with common subdirs are deleted
    # depth-first, which can be accomplished by sorting in reverse
    # alphabetical order.
    dirs_with_deletes = sorted(list(dirs_with_deletes), reverse=True)

    # For every directory which contained a deleted file (including children),
    # check if the directory should disappear. It should disappear if:
    #   1. There are no files in the directory, and...
    #   2. There are no child directories, and...
    #   3. The directory path is not present in dirs_with_adds.
    dirs_paths_to_delete = []
    for path in dirs_with_deletes:
      if path in dirs_with_adds or files.Files.list(
          path, namespace=namespace, limit=1, _internal=True):
        # The directory is marked for addition, or files still exist in it.
        continue
      subdirs = Dirs.list(path, limit=2)
      if len(subdirs) > 1:
        # Multiple subdirs exist; cannot delete dir.
        continue
      elif len(subdirs) == 1:
        # Handle the case where the only remaining subdir is marked for delete.
        if subdirs.values()[0].path not in dirs_paths_to_delete:
          continue
      dirs_paths_to_delete.append(path)

    # Batch get all directory entities, both added and deleted.
    ns = namespace
    dir_keys = [ndb.Key(_TitanDir, path, namespace=ns)
                for path in dirs_paths_to_delete]
    dir_keys += [ndb.Key(_TitanDir, path, namespace=ns)
                 for path in dirs_with_adds]
    existing_dir_ents = ndb.get_multi(dir_keys)

    # Transform into a dictionary mapping paths to existing entities:
    existing_dirs = {}
    for ent in existing_dir_ents:
      if ent:
        existing_dirs[ent.path] = ent

    changed_dir_ents = []
    for path in dirs_paths_to_delete:
      if path in existing_dirs:
        # Existing directory; mark it as deleted.
        ent = existing_dirs[path]
        if ent.status == _STATUS_DELETED:
          # Skip this entity entirely if it's already correct.
          continue
        ent.status = _STATUS_DELETED
      else:
        # Missing directory entity; create a new one marked as deleted.
        ent = _TitanDir(
            # NDB properties:
            id=path,
            namespace=namespace,
            # Model properties:
            name=os.path.basename(path),
            parent_path=os.path.dirname(path),
            parent_paths=utils.split_path(path),
            status=_STATUS_DELETED,
        )
      # Whitespace. Important: the append below runs for both branches.
      changed_dir_ents.append(ent)

    for path in dirs_with_adds:
      if path in existing_dirs:
        # Existing directory; make sure it's marked as available.
        ent = existing_dirs[path]
        if ent.status == _STATUS_AVAILABLE:
          # Skip this entity entirely if it's already correct.
          continue
        ent.status = _STATUS_AVAILABLE
      else:
        # Missing directory entity; create a new one marked as available.
        ent = _TitanDir(
            # NDB properties:
            id=path,
            namespace=namespace,
            # Model properties:
            name=os.path.basename(path),
            parent_path=os.path.dirname(path),
            parent_paths=utils.split_path(path),
            status=_STATUS_AVAILABLE,
        )
      # Whitespace. Important: the append below runs for both branches.
      changed_dir_ents.append(ent)

    # Write changed entities in chunks to bound the size of each put_multi.
    # With @ndb.toplevel, async puts are waited on before the request ends.
    for dir_ents in utils.chunk_generator(changed_dir_ents, chunk_size=100):
      if not async:
        ndb.put_multi(dir_ents)
      else:
        ndb.put_multi_async(dir_ents)
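# A minimal sketch of the utils.split_path helper used above, assuming it
# returns every ancestor directory of a path, root included (the root is then
# discarded by compute_affected_dirs). An illustration of the assumed
# behavior, not the actual implementation.
def split_path(path):
  """Returns ancestor dir paths: '/a/b/c.txt' -> ['/', '/a', '/a/b']."""
  dirs = []
  path = os.path.dirname(path)
  while path != '/':
    dirs.append(path)
    path = os.path.dirname(path)
  dirs.append('/')
  # Return shallowest-first, matching the example in the docstring.
  return list(reversed(dirs))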