def needs_chunking(self, attr: str, data: Dict) -> bool:
    """
    Recursively analyses the data and identifies if the current level of data should be chunked.
    This could happen if either isolate_attr marker in the current scope or recursively in any
    of sub-elements.

    :param attr: Name of attribute you want to check for chunking.
    :param data: Input dictionary to analyse.
    :return:     True if this level (or any nested chunkable level) requires chunking.
    """
    attrs = single_or_plural(attr)
    # Both `isolate_ATTR` flags and `max_ATTR_per_batch` settings trigger chunking.
    isolate_attrs = [f"isolate_{a}" for a in attrs] + [
        f"max_{a}_per_batch" for a in attrs
    ]
    # Isolation settings found at this level are propagated down into recursive calls below.
    root_isolate_attrs = self.get_isolate_attributes_from_job(data)

    if any(data[x] for x in isolate_attrs if x in data):
        logger.debug(
            f"needs_chunking(): Got requirement to isolate {attr} in the current scope: {data}"
        )
        return True

    next_attr = self.get_next_chunkable_attr(attr)
    logger.debug(
        f"needs_chunking(): Found next attr {next_attr}, for {attr} from {data}"
    )

    # We are not yet lowest level going recursive
    if next_attr:
        for a in attrs:
            current_vals = get_list_of_multiple_or_one_or_empty_from_dict(
                data, a)
            logger.debug(
                f"needs_chunking(): For {a} got current_vals: {current_vals} from {data}. "
                f"Analysing {next_attr}")
            for val in current_vals:
                # FIX: flat (non-dict) values have no nested structure to analyse, so they
                # can never require deeper chunking. Previously `val.items()` was called
                # unconditionally, which crashed with AttributeError whenever a chunkable
                # attribute with a configured next level held flat values (e.g. strings).
                if not isinstance(val, dict):
                    continue
                for name, subdata in val.items():
                    logger.debug(
                        f"needs_chunking(): Going recursive for {next_attr} in {subdata}"
                    )
                    if isinstance(subdata, dict) and self.needs_chunking(
                            next_attr, {**subdata, **root_isolate_attrs}):
                        logger.debug(
                            f"needs_chunking(): Returning True for {next_attr} from {subdata}"
                        )
                        return True
    return False
def chunk_job(self, job: dict, skeleton: Dict = None, attr: str = None) -> List[Dict]:
    """
    Recursively parses a job, validates everything and chunks to simple tasks what should be chunked.
    The Scenario of chunking and isolation is worth another story, so you should put a link here
    once it is ready.

    NOTE(review): a second `chunk_job` definition appears later in this file; if both live in the
    same class the later one shadows this one - confirm which version is intended to survive.

    :param job:      Input job dictionary to chunk. Deep-copied, so the caller's dict is not mutated.
    :param skeleton: Task template accumulated by parent levels during recursive calls.
    :param attr:     Attribute currently being chunked; defaults to the first preconfigured chunkable.
    :return:         List of task dictionaries.
    :raises InvalidJob: If a sub-value has an unsupported type for a chunkable attribute.
    """
    data = []
    skeleton = deepcopy(skeleton) or {}
    job = deepcopy(job)

    # The current attribute we are looking for in this iteration or the first one of preconfigured chunkables.
    # NOTE(review): due to operator precedence this parses as
    # `(attr or self.chunkable_attrs[0]) if self.chunkable_attrs else None`, so an explicitly
    # passed `attr` is discarded when `chunkable_attrs` is empty - confirm this is intended.
    attr = attr or self.chunkable_attrs[0] if self.chunkable_attrs else None

    # We have to return here the full job to let it work correctly with recursive calls.
    if not attr:
        return [{**job, **skeleton}]

    logger.debug(
        f"Testing for chunking {attr} from {job} with skeleton {skeleton}")

    # First of all decide whether we need to chunk current job (or a sub-job if called recursively).
    if self.needs_chunking(plural(attr), job):
        # Next attribute is either name of attribute according to config, or None if we are already in last level.
        next_attr = self.get_next_chunkable_attr(attr)

        # Here and many places further we support both single and plural versions of attribute names.
        for possible_attr in single_or_plural(attr):
            current_vals = get_list_of_multiple_or_one_or_empty_from_dict(
                job, possible_attr)
            if not current_vals:
                continue

            # This is not the `skeleton` received during the call, but the remaining parts of the `job`,
            # not related to current `attr`.
            job_skeleton = {
                k: v
                for k, v in job.items()
                if k not in [possible_attr, f"isolate_{attr}s"]
            }
            logger.debug(
                f"For {possible_attr} we got current_vals: {current_vals} from {job}, "
                f"leaving job_skeleton: {job_skeleton}")

            # For dictionaries we have to either go deeper recursively, or just flatten keys if values are None-s.
            if all(isinstance(v, dict) for v in current_vals):
                for val in current_vals:
                    for name, subdata in val.items():
                        logger.debug(
                            f"SubIterating `{name}` with {subdata}")
                        # Each named sub-element becomes (at least) one task built from the parent skeleton.
                        task = deepcopy(skeleton)
                        task.update(job_skeleton)
                        task[plural(attr)] = [name]
                        if isinstance(subdata, dict):
                            # print(f'DICT {subdata}, NEXTATTR {next_attr}')
                            if not next_attr:
                                # If there is no lower level configured to chunk, just keep this subdata in payload
                                task.update(subdata)
                                data.append(task)
                                # raise InvalidJob(f"Unexpected dictionary for unchunkable attribute: {attr}. "
                                #                  f"In order to chunk this, you should support this level in: "
                                #                  f"`config.job_schema.chunkable_attrs`. "
                                #                  f"If you want to pass custom payload - put it as `payload` in "
                                #                  f"your job. Job was: {job}")
                            else:
                                logger.debug(
                                    f"Call recursive for {next_attr} from subdata: {subdata}"
                                )
                                data.extend(
                                    self.chunk_job(job=subdata,
                                                   skeleton=task,
                                                   attr=next_attr))
                        # If None-s we just add a task. `Name` (which is actually a value in this scenario)
                        # was already added when creating task skeleton.
                        elif subdata is None:
                            logger.debug(
                                f"Appending task to data for {name} from {val}"
                            )
                            data.append(task)
                        else:
                            raise InvalidJob(
                                f"Unsupported type of val: {subdata} for attribute {possible_attr}"
                            )
            # If current vals are not dictionaries, we just validate that they are flat supported values
            else:
                vals = self.validate_list_of_vals(current_vals)
                for val in vals:
                    # One task per flat value - this is the actual isolation of values.
                    task = deepcopy(skeleton)
                    task.update(job_skeleton)
                    task[plural(attr)] = [val]
                    data.append(task)
    else:
        logger.debug(
            f"No need for chunking for attr: {attr} in job: {job}. Current skeleton is: {skeleton}"
        )
        # `skeleton` was already deep-copied above, so aliasing it here is safe.
        task = skeleton
        for a in single_or_plural(attr):
            if a in job:
                attr_value = job.pop(a, None)
                if attr_value:
                    try:
                        vals = self.validate_list_of_vals(attr_value)
                        task[plural(attr)] = vals
                    except InvalidJob:
                        logger.warning(f"Caught InvalidJob exception.")
                        # If a custom payload is not following the chunking convention - just translate it as is.
                        # And return the pop-ed value back to the job.
                        job[a] = attr_value
                # Only the first matching singular/plural form is consumed.
                break
        else:
            logger.error(f"Did not find values for {attr} in job: {job}")
        # Populate the remaining parts of the job back to task.
        task.update(job)
        logger.debug(f"Appending task to data: {task}")
        data.append(task)
    return data
def chunk_job(self, job: dict, skeleton: Dict = None, attr: str = None) -> List[Dict]:
    """
    Recursively parses a job, validates everything and chunks to simple tasks what should be chunked.
    The Scenario of chunking and isolation is worth another story, so you should put a link here
    once it is ready.

    NOTE(review): this definition shadows the earlier `chunk_job` in this file if both are in the
    same class - confirm the earlier one is obsolete and can be removed.

    :param job:      Input job dictionary to chunk. Deep-copied, so the caller's dict is not mutated.
    :param skeleton: Task template accumulated by parent levels during recursive calls.
    :param attr:     Attribute currently being chunked; defaults to the first preconfigured chunkable.
    :return:         List of task dictionaries.
    :raises InvalidJob: If a sub-value has an unsupported type for a chunkable attribute.
    """
    data = []
    skeleton = deepcopy(skeleton) or {}
    job = deepcopy(job)

    # The current attribute we are looking for in this iteration or the first one of preconfigured chunkables.
    # NOTE(review): due to operator precedence this parses as
    # `(attr or self.chunkable_attrs[0]) if self.chunkable_attrs else None`, so an explicitly
    # passed `attr` is discarded when `chunkable_attrs` is empty - confirm this is intended.
    attr = attr or self.chunkable_attrs[0] if self.chunkable_attrs else None

    # We have to return here the full job to let it work correctly with recursive calls.
    if not attr:
        return [{**job, **skeleton}]

    # If we shall need batching of flat vals of this attr we find out the batch size.
    # First we search in job (meaning the current level of recursive subdata being chunked).
    # If not specified per job, we try the setting inherited from level(s) above, possibly even
    # the root of the main job (carried down via `skeleton`).
    MAX_BATCH = 1000000  # This is not configurable!
    batch_size = int(
        job.get(f'max_{plural(attr)}_per_batch',
                skeleton.get(f'max_{plural(attr)}_per_batch', MAX_BATCH)))

    def push_list_chunks():
        """ Appends chunks of lists using current skeleton and vals to chunk. """
        # Closure over `vals`, `task_skeleton` and `batch_size` from the enclosing scope:
        # callers must set `vals` (and `task_skeleton`) before invoking.
        for v in chunks(vals, batch_size):
            data.append({**task_skeleton, **{plural(attr): v}})

    logger.debug(
        f"Testing for chunking {attr} from {job} with skeleton {skeleton}")

    # First of all decide whether we need to chunk current job (or a sub-job if called recursively).
    if self.needs_chunking(plural(attr), {**job, **skeleton}):
        # Force batches to isolate if we shall be dealing with flat data.
        # But we still respect the `max_PARAM_per_batch` if it is provided in job.
        # Having batch_size == MAX_BATCH asserts that no explicit per-batch setting was given,
        # so we fall back to full isolation: one value per task.
        batch_size = 1 if batch_size == MAX_BATCH else batch_size

        # Next attribute is either name of attribute according to config, or None if we are already in last level.
        next_attr = self.get_next_chunkable_attr(attr)
        logger.debug(f"Next attr: {next_attr}")

        # Here and many places further we support both single and plural versions of attribute names.
        for possible_attr in single_or_plural(attr):
            logger.debug(f"Iterating possible: {possible_attr}")
            current_vals = get_list_of_multiple_or_one_or_empty_from_dict(
                job, possible_attr)
            if not current_vals:
                continue

            # This is not the `skeleton` received during the call, but the remaining parts of the `job`,
            # not related to current `attr`.
            job_skeleton = {
                k: v
                for k, v in job.items() if k not in [possible_attr]
            }
            logger.debug(
                f"For {possible_attr} we got current_vals: {current_vals} from {job}, "
                f"leaving job_skeleton: {job_skeleton}")
            task_skeleton = {**deepcopy(skeleton), **job_skeleton}

            # For dictionaries we have to either go deeper recursively, or just flatten keys if values are None-s.
            if all(isinstance(v, dict) for v in current_vals):
                for val in current_vals:
                    if all(x is None for x in val.values()):
                        logger.debug(
                            f"Value {val} is all a dict of Nones. Need to flatten"
                        )
                        # A dict of all-None values is just a set of flat names - batch them.
                        vals = self.validate_list_of_vals(val)
                        push_list_chunks()
                    else:
                        logger.debug(
                            f"Real dictionary with values. Can't flatten it to dict: {val}"
                        )
                        for name, subdata in val.items():
                            logger.debug(
                                f"SubIterating `{name}` with {subdata}")
                            # Merge parts of task
                            task = {
                                **deepcopy(task_skeleton),
                                **{
                                    plural(attr): [name]
                                }
                            }
                            logger.debug(f"Task sample: {task}")
                            if isinstance(subdata, dict):
                                if not next_attr:
                                    # If there is no lower level configured to chunk, just keep this subdata in payload
                                    task.update(subdata)
                                    data.append(task)
                                else:
                                    logger.debug(
                                        f"Call recursive for {next_attr} from subdata: {subdata}"
                                    )
                                    data.extend(
                                        self.chunk_job(job=subdata,
                                                       skeleton=task,
                                                       attr=next_attr))
                            # If None-s we just add a task. `Name` (which is actually a value in this scenario)
                            # was already added when creating task skeleton.
                            elif subdata is None:
                                logger.debug(
                                    f"Appending task to data for {name} from {val}"
                                )
                                data.append(task)
                            else:
                                raise InvalidJob(
                                    f"Unsupported type of val: {subdata} for attribute {possible_attr}"
                                )
            # If current vals are not dictionaries, we just validate that they are flat supported values
            else:
                vals = self.validate_list_of_vals(current_vals)
                push_list_chunks()
    else:
        logger.debug(
            f"No need for chunking for attr: {attr} in job: {job}. Current skeleton is: {skeleton}"
        )
        task_skeleton = {**deepcopy(skeleton)}
        for a in single_or_plural(attr):
            if a in job:
                attr_value = job.pop(a, None)
                if attr_value:
                    try:
                        vals = self.validate_list_of_vals(attr_value)
                        push_list_chunks()
                        # We are done here for not-chunkable attr. Return now.
                        return data
                    except InvalidJob:
                        logger.warning(f"Caught InvalidJob exception.")
                        # If a custom payload is not following the chunking convention - just translate it as is.
                        # And return the pop-ed value back to the job.
                        job[a] = attr_value
                # Only the first matching singular/plural form is consumed.
                break
        else:
            logger.error(f"Did not find values for {attr} in job: {job}")
        # Populate the remaining parts of the job back to task.
        task_skeleton.update(job)
        data.append(task_skeleton)
    return data