    def step_init(self, flow, graph, step, decos, environment, flow_datastore,
                  logger):
        # Executing Kubernetes jobs requires a non-local datastore at the
        # moment.
        # TODO: To support MiniKube we need to enable local datastore execution.
        if flow_datastore.TYPE != "s3":
            raise KubernetesException(
                "The *@kubernetes* decorator requires --datastore=s3 "
                "at the moment.")

        # Set internal state.
        self.logger = logger
        self.environment = environment
        self.step = step
        self.flow_datastore = flow_datastore
        for deco in decos:
            if isinstance(deco, ResourcesDecorator):
                for k, v in deco.attributes.items():
                    # We use the larger of @resources and @k8s attributes
                    # TODO: Fix https://github.com/Netflix/metaflow/issues/467
                    my_val = self.attributes.get(k)
                    if not (my_val is None and v is None):
                        self.attributes[k] = str(
                            max(int(my_val or 0), int(v or 0)))

        # Set run time limit for the Kubernetes job.
        self.run_time_limit = get_run_time_limit_for_task(decos)
        if self.run_time_limit < 60:
            raise KubernetesException(
                "The timeout for step *{step}* should be "
                "at least 60 seconds for execution on "
                "Kubernetes.".format(step=step))
Example #2
    def step_init(self, flow, graph, step, decos, environment, flow_datastore,
                  logger):
        if flow_datastore.TYPE != "s3":
            raise BatchException(
                "The *@batch* decorator requires --datastore=s3.")

        # Set internal state.
        self.logger = logger
        self.environment = environment
        self.step = step
        self.flow_datastore = flow_datastore
        for deco in decos:
            if isinstance(deco, ResourcesDecorator):
                for k, v in deco.attributes.items():
                    # We use the larger of @resources and @batch attributes
                    # TODO: Fix https://github.com/Netflix/metaflow/issues/467
                    my_val = self.attributes.get(k)
                    if not (my_val is None and v is None):
                        self.attributes[k] = str(
                            max(int(my_val or 0), int(v or 0)))

        # Set run time limit for the AWS Batch job.
        self.run_time_limit = get_run_time_limit_for_task(decos)
        if self.run_time_limit < 60:
            raise BatchException(
                "The timeout for step *{step}* should be at "
                "least 60 seconds for execution on AWS Batch.".format(
                    step=step))
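
Both decorators obtain their timeout from get_run_time_limit_for_task(decos). A simplified stand-in for how such a lookup could work, assuming a @timeout-style decorator exposes its limit in seconds via a secs attribute and that a multi-day default applies otherwise (the real Metaflow helper may differ):

# Simplified stand-in; the actual Metaflow implementation may differ.
DEFAULT_RUN_TIME_LIMIT = 5 * 24 * 60 * 60  # assumed default: 5 days, in seconds

def get_run_time_limit_for_task(decos):
    run_time_limit = DEFAULT_RUN_TIME_LIMIT
    for deco in decos:
        # Assumes @timeout-style decorators expose their limit in seconds.
        if getattr(deco, "name", None) == "timeout":
            run_time_limit = deco.secs
    return run_time_limit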
Example #3
    def step_init(self, flow, graph, step, decos, environment, datastore, logger):
        if datastore.TYPE != 's3':
            raise BatchException('The *@batch* decorator requires --datastore=s3.')

        self.logger = logger
        self.environment = environment
        self.step = step
        for deco in decos:
            if isinstance(deco, ResourcesDecorator):
                for k, v in deco.attributes.items():
                    # we use the larger of @resources and @batch attributes
                    my_val = self.attributes.get(k)
                    if not (my_val is None and v is None):
                        self.attributes[k] = str(max(int(my_val or 0), int(v or 0)))
        self.run_time_limit = get_run_time_limit_for_task(decos)
        if self.run_time_limit < 60:
            raise BatchException(
                'The timeout for step *{step}* should be at '
                'least 60 seconds for execution on AWS Batch.'.format(step=step))
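
The BatchException (and KubernetesException) raised throughout these examples follows the usual Metaflow exception pattern; a minimal sketch, with the headline text being an assumption:

# Minimal sketch of the exception pattern; the headline text is an assumption.
from metaflow.exception import MetaflowException

class BatchException(MetaflowException):
    headline = "AWS Batch error"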
Example #4
    def step_init(self, flow, graph, step, decos, environment, flow_datastore,
                  logger):
        if flow_datastore.TYPE != "s3":
            raise BatchException(
                "The *@batch* decorator requires --datastore=s3.")

        # Set internal state.
        self.logger = logger
        self.environment = environment
        self.step = step
        self.flow_datastore = flow_datastore

        self.attributes.update(
            compute_resource_attributes(decos, self, self.resource_defaults))

        # Set run time limit for the AWS Batch job.
        self.run_time_limit = get_run_time_limit_for_task(decos)
        if self.run_time_limit < 60:
            raise BatchException(
                "The timeout for step *{step}* should be at "
                "least 60 seconds for execution on AWS Batch.".format(
                    step=step))
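
Example #4 replaces the hand-rolled merge loops of Examples #1-#3 with a single compute_resource_attributes call. A sketch of what that consolidation plausibly looks like, reconstructed from the call site and the earlier loops (not the verbatim Metaflow implementation):

# Reconstructed from the call site above and the loops in Examples #1-#3;
# not the verbatim Metaflow implementation.
def compute_resource_attributes(decos, compute_deco, resource_defaults):
    # Start from the defaults, overlaid with the compute decorator's values.
    attrs = {k: compute_deco.attributes.get(k, v)
             for k, v in resource_defaults.items()}
    for deco in decos:
        if isinstance(deco, ResourcesDecorator):  # assumed to be in scope
            for k, v in deco.attributes.items():
                if k not in attrs:
                    continue
                my_val = attrs.get(k)
                if not (my_val is None and v is None):
                    attrs[k] = str(max(int(my_val or 0), int(v or 0)))
    return attrs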
Example #5
    def container_template(self, node):
        """
        Returns an argo container template spec. to execute a step
        """
        attr = parse_step_decorator(node, ArgoStepDecorator)
        env_decorator = parse_step_decorator(node, EnvironmentDecorator)
        retry_decorator = parse_step_decorator(node, RetryDecorator)
        catch_decorator = parse_step_decorator(node, CatchDecorator)
        res_decorator = parse_step_decorator(node, ResourcesDecorator)
        k8s_decorator = parse_step_decorator(node, KubernetesDecorator)
        resources = merge_resources(
            res_decorator, {
                k: v
                for k, v in k8s_decorator.items()
                if k in ResourcesDecorator.defaults
            })
        image = attr.get('image') or k8s_decorator.get(
            'image') or self._default_image()
        env, env_from = self._prepare_environment(attr, env_decorator)
        res = self._resources(resources)
        volume_mounts = attr.get('volumeMounts', [])
        volume_mounts.append(self._shared_memory(resources))

        user_code_retries = retry_decorator.get('times', 0)
        total_retries = user_code_retries + 1 if catch_decorator else user_code_retries
        retry_count = '{{retries}}' if total_retries else '0'
        cmd = self._commands(node, retry_count, user_code_retries)

        metadata = {
            'labels': {
                **attr.get('labels', {}),
                **self.attributes['labels'],
                'metaflow/step_name': sanitize_label_value(dns_name(node.name)),
                'app.kubernetes.io/name': 'metaflow-task',
                'app.kubernetes.io/part-of': 'metaflow',
                'app.kubernetes.io/created-by': get_username(),
            },
            'annotations': {
                **attr.get('annotations', {}),
                **self.attributes['annotations'],
                # Should be a label, but Argo variables cannot be sanitized.
                'metaflow/attempt': retry_count,
            },
        }
        metadata['labels'].update(self.system_tags)

        template = {
            'name': dns_name(node.name),
            'metadata': metadata,
            'activeDeadlineSeconds': get_run_time_limit_for_task(node.decorators),
            'inputs': {
                'parameters': [{
                    'name': 'input-paths'
                }],
                'artifacts': attr.get('input_artifacts'),
            },
            'outputs': {
                'parameters': [{
                    'name': 'task-id',
                    'value': '{{pod.name}}'
                }],
                'artifacts': attr.get('output_artifacts')
            },
            'nodeSelector': attr.get('nodeSelector'),
            'container': {
                'image': image,
                'volumeMounts': volume_mounts,
                'command': [cmd[0]],
                'args': cmd[1:],
                'env': env,
                'envFrom': env_from,
                'resources': {
                    'requests': res,
                    'limits': res
                }
            },
        }

        if total_retries:
            template['retryStrategy'] = {
                'retryPolicy': 'Always',
                # fallback_step for @catch is only executed if retry_count > user_code_retries
                'limit': str(total_retries),
                'backoff': {
                    'duration': '%sm' % (retry_decorator['minutes_between_retries']
                                         if user_code_retries else 0),
                }
            }

        if self._is_foreach_first_child(node):
            template['inputs']['parameters'].append({'name': 'split-index'})

        if node.type == 'foreach':
            template['outputs']['parameters'].append({
                'name': 'num-splits',
                'valueFrom': {
                    'path': ArgoInternalStepDecorator.splits_file_path
                }
            })

        return template
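
The merge_resources call near the top of container_template combines @resources and @kubernetes values per key. A plausible shape for it, assuming both arguments are plain attribute dicts and that the larger value wins, as in the step_init examples:

# Plausible shape for merge_resources; assumes the larger value wins,
# mirroring the @resources merge in the step_init examples.
def merge_resources(resources, k8s_resources):
    merged = dict(resources)
    for k, v in k8s_resources.items():
        my_val = merged.get(k)
        if my_val is None and v is None:
            continue
        merged[k] = str(max(float(my_val or 0), float(v or 0)))
    return merged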
Example #6
    def step_init(self, flow, graph, step, decos, environment, flow_datastore,
                  logger):
        # Executing Kubernetes jobs requires a non-local datastore.
        if flow_datastore.TYPE != "s3":
            raise KubernetesException(
                "The *@kubernetes* decorator requires --datastore=s3 at the moment."
            )

        # Set internal state.
        self.logger = logger
        self.environment = environment
        self.step = step
        self.flow_datastore = flow_datastore

        if any(deco.name == "batch" for deco in decos):
            raise MetaflowException(
                "Step *{step}* is marked for execution both on AWS Batch and "
                "Kubernetes. Please use one or the other.".format(step=step))

        for deco in decos:
            if getattr(deco, "IS_PARALLEL", False):
                raise KubernetesException(
                    "@kubernetes does not support parallel execution currently."
                )

        # Set run time limit for the Kubernetes job.
        self.run_time_limit = get_run_time_limit_for_task(decos)
        if self.run_time_limit < 60:
            raise KubernetesException(
                "The timeout for step *{step}* should be at least 60 seconds for "
                "execution on Kubernetes.".format(step=step))

        for deco in decos:
            if isinstance(deco, ResourcesDecorator):
                for k, v in deco.attributes.items():
                    # TODO: Special case GPUs when they are introduced in @resources.
                    if k in self.attributes:
                        if self.defaults[k] is None:
                            # skip if expected value isn't an int/float
                            continue
                        # We use the larger of @resources and @kubernetes attributes
                        # TODO: Fix https://github.com/Netflix/metaflow/issues/467
                        my_val = self.attributes.get(k)
                        if not (my_val is None and v is None):
                            self.attributes[k] = str(
                                max(float(my_val or 0), float(v or 0)))

        # Check GPU vendor.
        if self.attributes["gpu_vendor"].lower() not in ("amd", "nvidia"):
            raise KubernetesException(
                "GPU vendor *{}* for step *{step}* is not currently supported."
                .format(self.attributes["gpu_vendor"], step=step))

        # CPU, Disk, and Memory values should be greater than 0.
        for attr in ["cpu", "disk", "memory"]:
            if not (isinstance(self.attributes[attr], (int, str, float))
                    and float(self.attributes[attr]) > 0):
                raise KubernetesException(
                    "Invalid {} value *{}* for step *{step}*; it should be greater than 0"
                    .format(attr, self.attributes[attr], step=step))

        if self.attributes["gpu"] is not None and not (
                isinstance(self.attributes["gpu"], (int, unicode, basestring))
                and float(self.attributes["gpu"]).is_integer()):
            raise KubernetesException(
                "Invalid GPU value *{}* for step *{step}*; it should be an integer"
                .format(self.attributes["gpu"], step=step))
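
The validation rules at the end of Example #6 can be exercised on their own. A quick standalone check of the same predicates; the attribute values below are made up:

# Standalone checks mirroring the validation above; the values are made up.
attributes = {"gpu_vendor": "nvidia", "cpu": "2", "disk": "10240",
              "memory": "4096", "gpu": "1"}

assert attributes["gpu_vendor"].lower() in ("amd", "nvidia")
for attr in ["cpu", "disk", "memory"]:
    assert (isinstance(attributes[attr], (int, str, float))
            and float(attributes[attr]) > 0)
assert attributes["gpu"] is None or (
    isinstance(attributes["gpu"], (int, str))
    and float(attributes["gpu"]).is_integer())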