Example #1
0
    def needs_chunking(self, attr: str, data: Dict) -> bool:
        """
        Recursively analyses the data and identifies if the current level of data should be chunked.
        This could happen if either isolate_attr marker in the current scope or recursively in any of sub-elements.

        :param attr:    Name of attribute you want to check for chunking.
        :param data:    Input dictionary to analyse.
        :return:        True if this level, or any nested sub-level, carries a truthy
                        `isolate_X` / `max_X_per_batch` marker.
        """

        attrs = single_or_plural(attr)
        # Both `isolate_X` and `max_X_per_batch` markers (singular or plural) trigger chunking.
        isolate_attrs = [f"isolate_{a}" for a in attrs
                         ] + [f"max_{a}_per_batch" for a in attrs]

        # Isolation markers from the root of the job are merged into every
        # recursive call below, so they propagate down the whole tree.
        root_isolate_attrs = self.get_isolate_attributes_from_job(data)

        if any(data[x] for x in isolate_attrs if x in data):
            logger.debug(
                f"needs_chunking(): Got requirement to isolate {attr} in the current scope: {data}"
            )
            return True

        next_attr = self.get_next_chunkable_attr(attr)

        logger.debug(
            f"needs_chunking(): Found next attr {next_attr}, for {attr} from {data}"
        )
        # We are not yet lowest level going recursive
        if next_attr:
            for a in attrs:
                current_vals = get_list_of_multiple_or_one_or_empty_from_dict(
                    data, a)
                logger.debug(
                    f"needs_chunking(): For {a} got current_vals: {current_vals} from {data}. "
                    f"Analysing {next_attr}")

                for val in current_vals:

                    # Flat values (strings, numbers, ...) carry no nested markers and
                    # previously crashed on `.items()`. `chunk_job` explicitly supports
                    # such flat values, so we must simply skip them here.
                    if not isinstance(val, dict):
                        continue

                    for name, subdata in val.items():
                        logger.debug(
                            f"needs_chunking(): Going recursive for {next_attr} in {subdata}"
                        )
                        if isinstance(subdata, dict) and self.needs_chunking(
                                next_attr, {
                                    **subdata,
                                    **root_isolate_attrs
                                }):
                            logger.debug(
                                f"needs_chunking(): Returning True for {next_attr} from {subdata}"
                            )
                            return True

        return False
Example #2
0
    def chunk_job(self,
                  job: dict,
                  skeleton: Dict = None,
                  attr: str = None) -> List[Dict]:
        """
        Recursively parses a job, validates everything and chunks to simple tasks what should be chunked.
        The Scenario of chunking and isolation is worth another story, so you should put a link here once it is ready.

        :param job:         The (sub-)job dictionary to chunk. A deep copy is taken, the caller's dict is untouched.
        :param skeleton:    Task attributes accumulated by upper levels of the recursion.
        :param attr:        Attribute to chunk at this level; defaults to the first preconfigured chunkable attribute.
        :return:            Flat list of task dictionaries.
        """

        data = []
        skeleton = deepcopy(skeleton) or {}
        job = deepcopy(job)

        # The current attribute we are looking for in this iteration or the first one of preconfigured chunkables.
        # Parentheses are required: the bare conditional expression binds looser than `or`,
        # so without them an explicitly passed `attr` was silently discarded whenever
        # `chunkable_attrs` was empty.
        attr = attr or (self.chunkable_attrs[0] if self.chunkable_attrs else None)

        # We have to return here the full job to let it work correctly with recursive calls.
        if not attr:
            return [{**job, **skeleton}]

        logger.debug(
            f"Testing for chunking {attr} from {job} with skeleton {skeleton}")

        # First of all decide whether we need to chunk current job (or a sub-job if called recursively).
        if self.needs_chunking(plural(attr), job):

            # Next attribute is either name of attribute according to config, or None if we are already in last level.
            next_attr = self.get_next_chunkable_attr(attr)

            # Here and many places further we support both single and plural versions of attribute names.
            for possible_attr in single_or_plural(attr):
                current_vals = get_list_of_multiple_or_one_or_empty_from_dict(
                    job, possible_attr)
                if not current_vals:
                    continue

                # This is not the `skeleton` received during the call, but the remaining parts of the `job`,
                # not related to current `attr`.
                # NOTE(review): the isolation marker is reconstructed as f"isolate_{attr}s" rather than
                # f"isolate_{plural(attr)}" - confirm both spellings agree for irregular plurals.
                job_skeleton = {
                    k: v
                    for k, v in job.items()
                    if k not in [possible_attr, f"isolate_{attr}s"]
                }
                logger.debug(
                    f"For {possible_attr} we got current_vals: {current_vals} from {job}, "
                    f"leaving job_skeleton: {job_skeleton}")

                # For dictionaries we have to either go deeper recursively, or just flatten keys if values are None-s.
                if all(isinstance(v, dict) for v in current_vals):
                    for val in current_vals:
                        for name, subdata in val.items():
                            logger.debug(
                                f"SubIterating `{name}` with {subdata}")

                            task = deepcopy(skeleton)
                            task.update(job_skeleton)
                            task[plural(attr)] = [name]

                            if isinstance(subdata, dict):
                                if not next_attr:
                                    # If there is no lower level configured to chunk, just keep this subdata in payload
                                    task.update(subdata)
                                    data.append(task)
                                else:
                                    logger.debug(
                                        f"Call recursive for {next_attr} from subdata: {subdata}"
                                    )
                                    data.extend(
                                        self.chunk_job(job=subdata,
                                                       skeleton=task,
                                                       attr=next_attr))

                            # If None-s we just add a task. `Name` (which is actually a value in this scenario)
                            # was already added when creating task skeleton.
                            elif subdata is None:
                                logger.debug(
                                    f"Appending task to data for {name} from {val}"
                                )
                                data.append(task)

                            else:
                                raise InvalidJob(
                                    f"Unsupported type of val: {subdata} for attribute {possible_attr}"
                                )

                # If current vals are not dictionaries, we just validate that they are flat supported values
                else:
                    vals = self.validate_list_of_vals(current_vals)

                    for val in vals:
                        task = deepcopy(skeleton)
                        task.update(job_skeleton)
                        task[plural(attr)] = [val]
                        data.append(task)

        else:
            logger.debug(
                f"No need for chunking for attr: {attr} in job: {job}. Current skeleton is: {skeleton}"
            )
            task = skeleton

            for a in single_or_plural(attr):
                if a in job:
                    attr_value = job.pop(a, None)
                    if attr_value:
                        try:
                            vals = self.validate_list_of_vals(attr_value)
                            task[plural(attr)] = vals
                        except InvalidJob:
                            logger.warning(f"Caught InvalidJob exception.")
                            # If a custom payload is not following the chunking convention - just translate it as is.
                            # And return the pop-ed value back to the job.
                            job[a] = attr_value
                        break
            else:
                logger.error(f"Did not find values for {attr} in job: {job}")
            # Populate the remaining parts of the job back to task.
            task.update(job)

            logger.debug(f"Appending task to data: {task}")
            data.append(task)

        return data
Example #3
0
    def chunk_job(self,
                  job: dict,
                  skeleton: Dict = None,
                  attr: str = None) -> List[Dict]:
        """
        Recursively parses a job, validates everything and chunks to simple tasks what should be chunked.
        The Scenario of chunking and isolation is worth another story, so you should put a link here once it is ready.

        :param job:         The (sub-)job dictionary to chunk. A deep copy is taken, the caller's dict is untouched.
        :param skeleton:    Task attributes accumulated by upper levels of the recursion.
        :param attr:        Attribute to chunk at this level; defaults to the first preconfigured chunkable attribute.
        :return:            Flat list of task dictionaries.
        """

        data = []
        skeleton = deepcopy(skeleton) or {}
        job = deepcopy(job)

        # The current attribute we are looking for in this iteration or the first one of preconfigured chunkables.
        # Parentheses are required: the bare conditional expression binds looser than `or`,
        # so without them an explicitly passed `attr` was silently discarded whenever
        # `chunkable_attrs` was empty.
        attr = attr or (self.chunkable_attrs[0] if self.chunkable_attrs else None)

        # We have to return here the full job to let it work correctly with recursive calls.
        if not attr:
            return [{**job, **skeleton}]

        # If we shall need batching of flat vals of this attr we find out the batch size.
        # First we search in job (means the current level of recursive subdata being chunked).
        # If not specified per job, we try the setting inherited from level(s) upper probably even the root of main job.
        MAX_BATCH = 1000000  # This is not configurable!
        batch_size = int(
            job.get(f'max_{plural(attr)}_per_batch',
                    skeleton.get(f'max_{plural(attr)}_per_batch', MAX_BATCH)))

        def push_list_chunks():
            """ Appends chunks of lists using current skeleton and vals to chunk.

            Closes over `vals`, `task_skeleton` and `batch_size` from the enclosing
            scope - callers must bind them before invoking.
            """
            for v in chunks(vals, batch_size):
                data.append({**task_skeleton, **{plural(attr): v}})

        logger.debug(
            f"Testing for chunking {attr} from {job} with skeleton {skeleton}")
        # First of all decide whether we need to chunk current job (or a sub-job if called recursively).
        if self.needs_chunking(plural(attr), {**job, **skeleton}):

            # Force batches to isolate if we shall be dealing with flat data.
            # But we still respect the `max_PARAM_per_batch` if it is provided in job.
            # Having batch_size == MAX_BATCH asserts that neither job nor skeleton
            # specified `max_X_per_batch` explicitly, so isolation means one value per task.
            batch_size = 1 if batch_size == MAX_BATCH else batch_size

            # Next attribute is either name of attribute according to config, or None if we are already in last level.
            next_attr = self.get_next_chunkable_attr(attr)
            logger.debug(f"Next attr: {next_attr}")

            # Here and many places further we support both single and plural versions of attribute names.
            for possible_attr in single_or_plural(attr):
                logger.debug(f"Iterating possible: {possible_attr}")
                current_vals = get_list_of_multiple_or_one_or_empty_from_dict(
                    job, possible_attr)
                if not current_vals:
                    continue

                # This is not the `skeleton` received during the call, but the remaining parts of the `job`,
                # not related to current `attr`
                job_skeleton = {
                    k: v
                    for k, v in job.items() if k not in [possible_attr]
                }
                logger.debug(
                    f"For {possible_attr} we got current_vals: {current_vals} from {job}, "
                    f"leaving job_skeleton: {job_skeleton}")

                task_skeleton = {**deepcopy(skeleton), **job_skeleton}

                # For dictionaries we have to either go deeper recursively, or just flatten keys if values are None-s.
                if all(isinstance(v, dict) for v in current_vals):
                    for val in current_vals:

                        if all(x is None for x in val.values()):
                            logger.debug(
                                f"Value {val} is all a dict of Nones. Need to flatten"
                            )
                            vals = self.validate_list_of_vals(val)
                            push_list_chunks()

                        else:
                            logger.debug(
                                f"Real dictionary with values. Can't flatten it to dict: {val}"
                            )
                            for name, subdata in val.items():
                                logger.debug(
                                    f"SubIterating `{name}` with {subdata}")

                                # Merge parts of task
                                task = {
                                    **deepcopy(task_skeleton),
                                    **{
                                        plural(attr): [name]
                                    }
                                }
                                logger.debug(f"Task sample: {task}")

                                if isinstance(subdata, dict):
                                    if not next_attr:
                                        # If there is no lower level configured to chunk, just keep this subdata in payload
                                        task.update(subdata)
                                        data.append(task)
                                    else:
                                        logger.debug(
                                            f"Call recursive for {next_attr} from subdata: {subdata}"
                                        )
                                        data.extend(
                                            self.chunk_job(job=subdata,
                                                           skeleton=task,
                                                           attr=next_attr))

                                # If None-s we just add a task. `Name` (which is actually a value in this scenario)
                                # was already added when creating task skeleton.
                                elif subdata is None:
                                    logger.debug(
                                        f"Appending task to data for {name} from {val}"
                                    )
                                    data.append(task)
                                else:
                                    raise InvalidJob(
                                        f"Unsupported type of val: {subdata} for attribute {possible_attr}"
                                    )

                # If current vals are not dictionaries, we just validate that they are flat supported values
                else:
                    vals = self.validate_list_of_vals(current_vals)
                    push_list_chunks()

        else:
            logger.debug(
                f"No need for chunking for attr: {attr} in job: {job}. Current skeleton is: {skeleton}"
            )
            task_skeleton = {**deepcopy(skeleton)}
            for a in single_or_plural(attr):
                if a in job:
                    attr_value = job.pop(a, None)
                    if attr_value:
                        try:
                            vals = self.validate_list_of_vals(attr_value)
                            push_list_chunks()

                            # We are done here for not-chunkable attr. Return now.
                            return data

                        except InvalidJob:
                            logger.warning(f"Caught InvalidJob exception.")
                            # If a custom payload is not following the chunking convention - just translate it as is.
                            # And return the pop-ed value back to the job.
                            job[a] = attr_value
                        break
            else:
                logger.error(f"Did not find values for {attr} in job: {job}")
            # Populate the remaining parts of the job back to task.
            task_skeleton.update(job)
            data.append(task_skeleton)

        return data