Example #1
class NullMonitor(object):
    TYPE = "nullSidecarMonitor"

    def __init__(self, *args, **kwargs):
        # Currently, flow and env are passed as kwargs
        self._sidecar = Sidecar(self.TYPE)

    def start(self):
        return self._sidecar.start()

    def terminate(self):
        return self._sidecar.terminate()

    def send(self, msg):
        # Arbitrary message sending. Useful if you want to override how
        # different types of messages are handled.
        self._sidecar.send(msg)

    @contextmanager
    def count(self, name):
        if self._sidecar.is_active:
            counter = Counter(name)
            counter.increment()
            payload = {"counter": counter.serialize()}
            msg = Message(MessageTypes.BEST_EFFORT, payload)
            yield
            self._sidecar.send(msg)
        else:
            yield

    @contextmanager
    def measure(self, name):
        if self._sidecar.is_active:
            timer = Timer(name + "_timer")
            counter = Counter(name + "_counter")
            timer.start()
            counter.increment()
            yield
            timer.end()
            payload = {
                "counter": counter.serialize(),
                "timer": timer.serialize()
            }
            msg = Message(MessageTypes.BEST_EFFORT, payload)
            self._sidecar.send(msg)
        else:
            yield

    def gauge(self, gauge):
        if self._sidecar.is_active:
            payload = {"gauge": gauge.serialize()}
            msg = Message(MessageTypes.BEST_EFFORT, payload)
            self._sidecar.send(msg)

    @classmethod
    def get_worker(cls):
        return None
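
A minimal usage sketch for the monitor interface above; the listing omits its module imports, so the constructor call and the do_work helper below are assumptions:

# Hedged usage sketch; NullMonitor and its Sidecar are assumed to be importable
# from the surrounding Metaflow module, and do_work is a placeholder.
def do_work():
    return sum(range(1000))

monitor = NullMonitor()            # flow/env kwargs are optional here
monitor.start()                    # starts the (no-op) sidecar process
with monitor.measure("my_step"):   # emits a timer and counter only if the sidecar is active
    do_work()
with monitor.count("my_event"):    # emits a counter only if the sidecar is active
    do_work()
monitor.terminate()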
Example #2
    def task_pre_step(
        self,
        step_name,
        task_datastore,
        metadata,
        run_id,
        task_id,
        flow,
        graph,
        retry_count,
        max_retries,
        ubf_context,
        inputs,
    ):
        self.metadata = metadata
        self.task_datastore = task_datastore

        # task_pre_step may run locally if fallback is activated for @catch
        # decorator. In that scenario, we skip collecting Kubernetes execution
        # metadata. A rudimentary way to detect non-local execution is to
        # check for the existence of METAFLOW_KUBERNETES_WORKLOAD environment
        # variable.

        if "METAFLOW_KUBERNETES_WORKLOAD" in os.environ:
            meta = {}
            meta["kubernetes-pod-name"] = os.environ[
                "METAFLOW_KUBERNETES_POD_NAME"]
            meta["kubernetes-pod-namespace"] = os.environ[
                "METAFLOW_KUBERNETES_POD_NAMESPACE"]
            meta["kubernetes-pod-id"] = os.environ[
                "METAFLOW_KUBERNETES_POD_ID"]
            meta["kubernetes-pod-service-account-name"] = os.environ[
                "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME"]
            # Unfortunately, there doesn't seem to be any straightforward way right
            # now to attach the Batch/v1 name. We could rely on a hacky approach,
            # since the pod name is simply the Batch/v1 name plus a hyphen-delimited
            # unique suffix, but that approach fails when the Batch/v1 name gets
            # close to 63 chars and the pod name truncates it.
            # if "ARGO_WORKFLOW_NAME" not in os.environ:
            #     meta["kubernetes-job-name"] = os.environ[
            #         "METAFLOW_KUBERNETES_POD_NAME"
            #     ].rpartition("-")[0]

            entries = [
                MetaDatum(field=k, value=v, type=k, tags=[])
                for k, v in meta.items()
            ]
            # Register book-keeping metadata for debugging.
            metadata.register_metadata(run_id, step_name, task_id, entries)

            # Start MFLog sidecar to collect task logs.
            self._save_logs_sidecar = Sidecar("save_logs_periodically")
            self._save_logs_sidecar.start()
Example #3
class NullEventLogger(object):
    TYPE = "nullSidecarLogger"

    def __init__(self, *args, **kwargs):
        # Currently, flow and env are passed in kwargs
        self._sidecar = Sidecar(self.TYPE)

    def start(self):
        return self._sidecar.start()

    def terminate(self):
        return self._sidecar.terminate()

    def send(self, msg):
        # Arbitrary message sending. Useful if you want to override how
        # different types of messages are handled.
        self._sidecar.send(msg)

    def log(self, payload):
        if self._sidecar.is_active:
            msg = Message(MessageTypes.BEST_EFFORT, payload)
            self._sidecar.send(msg)

    @classmethod
    def get_worker(cls):
        return None
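
A similarly hedged sketch of the logger interface; the payload contents are illustrative:

# Hedged usage sketch; NullEventLogger is constructed like the monitor above.
logger = NullEventLogger()
logger.start()
logger.log({"event": "task_started", "step": "start"})  # sent only if the sidecar is active
logger.terminate()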
Example #4
    def _start_heartbeat(
        self, heartbeat_type, flow_id, run_id, step_name=None, task_id=None
    ):
        if self._already_started():
            # A single ServiceMetadataProvider instance cannot start multiple
            # heartbeat sidecars of any type/combination. Either a single run
            # heartbeat or a single task heartbeat can be started.
            raise Exception("heartbeat already started")
        # create init message
        payload = {}
        if heartbeat_type == HeartbeatTypes.TASK:
            # create task heartbeat
            data = {
                "flow_id": flow_id,
                "run_number": run_id,
                "step_name": step_name,
                "task_id": task_id,
            }
            payload[HB_URL_KEY] = self.url_task_template.format(**data)
        elif heartbeat_type == HeartbeatTypes.RUN:
            # create run heartbeat
            data = {"flow_id": flow_id, "run_number": run_id}

            payload[HB_URL_KEY] = self.url_run_template.format(**data)
        else:
            raise Exception("invalid heartbeat type")
        payload["service_version"] = self.version()
        # start sidecar
        if self.version() is None or LooseVersion(self.version()) < LooseVersion(
            "2.0.4"
        ):
            # if old version of the service is running
            # then avoid running real heartbeat sidecar process
            self.sidecar = Sidecar("none")
        else:
            self.sidecar = Sidecar("heartbeat")
        self.sidecar.start()
        self.sidecar.send(Message(MessageTypes.BEST_EFFORT, payload))
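
The choice between the "none" and "heartbeat" sidecars above hinges on a version comparison. A standalone sketch of that gate, assuming a Python version where distutils is still importable (Metaflow vendors its own LooseVersion) and an illustrative service version string:

from distutils.version import LooseVersion

service_version = "2.0.3"  # illustrative value returned by self.version()
use_real_heartbeat = (
    service_version is not None
    and LooseVersion(service_version) >= LooseVersion("2.0.4")
)
print(use_real_heartbeat)  # False, so the no-op "none" sidecar would be chosen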
Example #5
    def __init__(self, *args, **kwargs):
        # Currently, flow and env are passed as kwargs
        self._sidecar = Sidecar(self.TYPE)
Example #6
class KubernetesDecorator(StepDecorator):
    """
    Step decorator to specify that this step should execute on Kubernetes.

    This decorator indicates that your step should execute on Kubernetes. Note
    that you can apply this decorator automatically to all steps using the
    ```--with kubernetes``` argument when calling run/resume. Step level
    decorators within the code are overrides and will force a step to execute
    on Kubernetes regardless of the ```--with``` specification.

    To use, annotate your step as follows:
    ```
    @kubernetes
    @step
    def my_step(self):
        ...
    ```
    Parameters
    ----------
    cpu : int
        Number of CPUs required for this step. Defaults to 1. If @resources is
        also present, the maximum value from all decorators is used
    memory : int
        Memory size (in MB) required for this step. Defaults to 4096. If
        @resources is also present, the maximum value from all decorators is
        used
    disk : int
        Disk size (in MB) required for this step. Defaults to 10GB. If
        @resources is also present, the maximum value from all decorators is
        used
    image : string
        Docker image to use when launching on Kubernetes. If not specified, a
        default docker image mapping to the current version of Python is used
    """

    name = "kubernetes"
    defaults = {
        "cpu": "1",
        "memory": "4096",
        "disk": "10240",
        "image": None,
        "service_account": None,
        "secrets": None,  # e.g., mysecret
        "node_selector": None,  # e.g., kubernetes.io/os=linux
        "namespace": None,
        "gpu":
        None,  # value of 0 implies that the scheduled node should not have GPUs
        "gpu_vendor": None,
    }
    package_url = None
    package_sha = None
    run_time_limit = None

    def __init__(self, attributes=None, statically_defined=False):
        super(KubernetesDecorator, self).__init__(attributes,
                                                  statically_defined)

        if not self.attributes["namespace"]:
            self.attributes["namespace"] = KUBERNETES_NAMESPACE
        if not self.attributes["service_account"]:
            self.attributes["service_account"] = KUBERNETES_SERVICE_ACCOUNT
        if not self.attributes["gpu_vendor"]:
            self.attributes["gpu_vendor"] = KUBERNETES_GPU_VENDOR

        # TODO: Handle node_selector in a better manner. Currently it is special
        #       cased in kubernetes_client.py

        # If no docker image is explicitly specified, impute a default image.
        if not self.attributes["image"]:
            # If metaflow-config specifies a docker image, just use that.
            if KUBERNETES_CONTAINER_IMAGE:
                self.attributes["image"] = KUBERNETES_CONTAINER_IMAGE
            # If metaflow-config doesn't specify a docker image, assign a
            # default docker image.
            else:
                # Default to vanilla Python image corresponding to major.minor
                # version of the Python interpreter launching the flow.
                self.attributes["image"] = "python:%s.%s" % (
                    platform.python_version_tuple()[0],
                    platform.python_version_tuple()[1],
                )
        # Assign docker registry URL for the image.
        if not get_docker_registry(self.attributes["image"]):
            if KUBERNETES_CONTAINER_REGISTRY:
                self.attributes["image"] = "%s/%s" % (
                    KUBERNETES_CONTAINER_REGISTRY.rstrip("/"),
                    self.attributes["image"],
                )

    # Refer https://github.com/Netflix/metaflow/blob/master/docs/lifecycle.png
    def step_init(self, flow, graph, step, decos, environment, flow_datastore,
                  logger):
        # Executing Kubernetes jobs requires a non-local datastore.
        if flow_datastore.TYPE != "s3":
            raise KubernetesException(
                "The *@kubernetes* decorator requires --datastore=s3 at the moment."
            )

        # Set internal state.
        self.logger = logger
        self.environment = environment
        self.step = step
        self.flow_datastore = flow_datastore

        if any([deco.name == "batch" for deco in decos]):
            raise MetaflowException(
                "Step *{step}* is marked for execution both on AWS Batch and "
                "Kubernetes. Please use one or the other.".format(step=step))

        for deco in decos:
            if getattr(deco, "IS_PARALLEL", False):
                raise KubernetesException(
                    "@kubernetes does not support parallel execution currently."
                )

        # Set run time limit for the Kubernetes job.
        self.run_time_limit = get_run_time_limit_for_task(decos)
        if self.run_time_limit < 60:
            raise KubernetesException(
                "The timeout for step *{step}* should be at least 60 seconds for "
                "execution on Kubernetes.".format(step=step))

        for deco in decos:
            if isinstance(deco, ResourcesDecorator):
                for k, v in deco.attributes.items():
                    # TODO: Special case GPUs when they are introduced in @resources.
                    if k in self.attributes:
                        if self.defaults[k] is None:
                            # skip if expected value isn't an int/float
                            continue
                        # We use the larger of @resources and @batch attributes
                        # TODO: Fix https://github.com/Netflix/metaflow/issues/467
                        my_val = self.attributes.get(k)
                        if not (my_val is None and v is None):
                            self.attributes[k] = str(
                                max(float(my_val or 0), float(v or 0)))

        # Check GPU vendor.
        if self.attributes["gpu_vendor"].lower() not in ("amd", "nvidia"):
            raise KubernetesException(
                "GPU vendor *{}* for step *{step}* is not currently supported."
                .format(self.attributes["gpu_vendor"], step=step))

        # CPU, Disk, and Memory values should be greater than 0.
        for attr in ["cpu", "disk", "memory"]:
            if not (isinstance(self.attributes[attr],
                               (int, unicode, basestring, float))
                    and float(self.attributes[attr]) > 0):
                raise KubernetesException(
                    "Invalid {} value *{}* for step *{step}*; it should be greater than 0"
                    .format(attr, self.attributes[attr], step=step))

        if self.attributes["gpu"] is not None and not (
                isinstance(self.attributes["gpu"], (int, unicode, basestring))
                and float(self.attributes["gpu"]).is_integer()):
            raise KubernetesException(
                "Invalid GPU value *{}* for step *{step}*; it should be an integer"
                .format(self.attributes["gpu"], step=step))

    def package_init(self, flow, step_name, environment):
        try:
            # Kubernetes is a soft dependency.
            from kubernetes import client, config
        except (NameError, ImportError):
            raise KubernetesException(
                "Could not import module 'kubernetes'.\n\nInstall Kubernetes "
                "Python package (https://pypi.org/project/kubernetes/) first.\n"
                "You can install the module by executing - "
                "%s -m pip install kubernetes\n"
                "or equivalent through your favorite Python package manager." %
                sys.executable)

    def runtime_init(self, flow, graph, package, run_id):
        # Set some more internal state.
        self.flow = flow
        self.graph = graph
        self.package = package
        self.run_id = run_id

    def runtime_task_created(self, task_datastore, task_id, split_index,
                             input_paths, is_cloned, ubf_context):
        # To execute the Kubernetes job, the job container needs to have
        # access to the code package. We store the package in the datastore
        # which the pod is able to download as part of its entrypoint.
        if not is_cloned:
            self._save_package_once(self.flow_datastore, self.package)

    def runtime_step_cli(self, cli_args, retry_count, max_user_code_retries,
                         ubf_context):
        if retry_count <= max_user_code_retries:
            # We only execute on Kubernetes while user-code retries remain. After
            # all attempts to run the user code have failed, possible fallback
            # code (e.g. from @catch) can execute locally instead.
            cli_args.commands = ["kubernetes", "step"]
            cli_args.command_args.append(self.package_sha)
            cli_args.command_args.append(self.package_url)

            # --namespace is used to specify Metaflow namespace (a different
            # concept from k8s namespace).
            for k, v in self.attributes.items():
                if k == "namespace":
                    cli_args.command_options["k8s_namespace"] = v
                else:
                    cli_args.command_options[k] = v
            cli_args.command_options["run-time-limit"] = self.run_time_limit
            cli_args.entrypoint[0] = sys.executable

    def task_pre_step(
        self,
        step_name,
        task_datastore,
        metadata,
        run_id,
        task_id,
        flow,
        graph,
        retry_count,
        max_retries,
        ubf_context,
        inputs,
    ):
        self.metadata = metadata
        self.task_datastore = task_datastore

        # task_pre_step may run locally if fallback is activated for @catch
        # decorator. In that scenario, we skip collecting Kubernetes execution
        # metadata. A rudimentary way to detect non-local execution is to
        # check for the existence of METAFLOW_KUBERNETES_WORKLOAD environment
        # variable.

        if "METAFLOW_KUBERNETES_WORKLOAD" in os.environ:
            meta = {}
            meta["kubernetes-pod-name"] = os.environ[
                "METAFLOW_KUBERNETES_POD_NAME"]
            meta["kubernetes-pod-namespace"] = os.environ[
                "METAFLOW_KUBERNETES_POD_NAMESPACE"]
            meta["kubernetes-pod-id"] = os.environ[
                "METAFLOW_KUBERNETES_POD_ID"]
            meta["kubernetes-pod-service-account-name"] = os.environ[
                "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME"]
            # Unfortunately, there doesn't seem to be any straightforward way right
            # now to attach the Batch/v1 name. We could rely on a hacky approach,
            # since the pod name is simply the Batch/v1 name plus a hyphen-delimited
            # unique suffix, but that approach fails when the Batch/v1 name gets
            # close to 63 chars and the pod name truncates it.
            # if "ARGO_WORKFLOW_NAME" not in os.environ:
            #     meta["kubernetes-job-name"] = os.environ[
            #         "METAFLOW_KUBERNETES_POD_NAME"
            #     ].rpartition("-")[0]

            entries = [
                MetaDatum(field=k, value=v, type=k, tags=[])
                for k, v in meta.items()
            ]
            # Register book-keeping metadata for debugging.
            metadata.register_metadata(run_id, step_name, task_id, entries)

            # Start MFLog sidecar to collect task logs.
            self._save_logs_sidecar = Sidecar("save_logs_periodically")
            self._save_logs_sidecar.start()

    def task_finished(self, step_name, flow, graph, is_task_ok, retry_count,
                      max_retries):
        # task_finished may run locally if fallback is activated for @catch
        # decorator.
        if "METAFLOW_KUBERNETES_WORKLOAD" in os.environ:
            # If `local` metadata is configured, we need to copy task
            # execution metadata from the Kubernetes pod to the user's
            # local file system after the user code has finished execution.
            # This happens via the datastore as a communication bridge.

            # TODO:  There is no guarantee that task_pre_step executes before
            #        task_finished is invoked, which would result in
            #        AttributeError: 'KubernetesDecorator' object has no
            #        attribute 'metadata'.
            if self.metadata.TYPE == "local":
                # Note that the datastore is *always* Amazon S3 (see
                # runtime_task_created function).
                sync_local_metadata_to_datastore(DATASTORE_LOCAL_DIR,
                                                 self.task_datastore)

        try:
            self._save_logs_sidecar.terminate()
        except:
            # Best effort kill
            pass

    @classmethod
    def _save_package_once(cls, flow_datastore, package):
        if cls.package_url is None:
            cls.package_url, cls.package_sha = flow_datastore.save_data(
                [package.blob], len_hint=1)[0]
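
For context, a hedged end-to-end sketch of applying the decorator defined above; the top-level metaflow imports and the resource values are assumptions based on the docstring:

from metaflow import FlowSpec, step, kubernetes

class HelloKubernetesFlow(FlowSpec):

    @kubernetes(cpu=2, memory=8192, image="python:3.10")
    @step
    def start(self):
        print("running inside a Kubernetes pod")
        self.next(self.end)

    @step
    def end(self):
        print("done")

if __name__ == "__main__":
    HelloKubernetesFlow()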
Example #7
class ServiceMetadataProvider(MetadataProvider):
    TYPE = "service"

    _supports_attempt_gets = None
    _supports_tag_mutation = None

    def __init__(self, environment, flow, event_logger, monitor):
        super(ServiceMetadataProvider, self).__init__(
            environment, flow, event_logger, monitor
        )
        self.url_task_template = os.path.join(
            METADATA_SERVICE_URL,
            "flows/{flow_id}/runs/{run_number}/steps/{step_name}/tasks/{task_id}/heartbeat",
        )
        self.url_run_template = os.path.join(
            METADATA_SERVICE_URL, "flows/{flow_id}/runs/{run_number}/heartbeat"
        )
        self.sidecar = None

    @classmethod
    def compute_info(cls, val):
        v = val.rstrip("/")
        try:
            resp = requests.get(
                os.path.join(v, "ping"), headers=METADATA_SERVICE_HEADERS
            )
            resp.raise_for_status()
        except:  # noqa E722
            raise ValueError("Metaflow service [%s] unreachable." % v)
        return v

    @classmethod
    def default_info(cls):
        return METADATA_SERVICE_URL

    def version(self):
        return self._version(self._monitor)

    def new_run_id(self, tags=None, sys_tags=None):
        v, _ = self._new_run(tags=tags, sys_tags=sys_tags)
        return v

    def register_run_id(self, run_id, tags=None, sys_tags=None):
        try:
            # don't try to register an integer ID which was obtained
            # from the metadata service in the first place
            int(run_id)
            return False
        except ValueError:
            _, did_create = self._new_run(run_id, tags=tags, sys_tags=sys_tags)
            return did_create

    def new_task_id(self, run_id, step_name, tags=None, sys_tags=None):
        v, _ = self._new_task(run_id, step_name, tags=tags, sys_tags=sys_tags)
        return v

    def register_task_id(
        self, run_id, step_name, task_id, attempt=0, tags=None, sys_tags=None
    ):
        try:
            # don't try to register an integer ID which was obtained
            # from the metadata service in the first place
            int(task_id)
        except ValueError:
            _, did_create = self._new_task(
                run_id,
                step_name,
                task_id=task_id,
                attempt=attempt,
                tags=tags,
                sys_tags=sys_tags,
            )
            return did_create
        else:
            self._register_system_metadata(run_id, step_name, task_id, attempt)
            return False

    def _start_heartbeat(
        self, heartbeat_type, flow_id, run_id, step_name=None, task_id=None
    ):
        if self._already_started():
            # A single ServiceMetadataProvider instance cannot start multiple
            # heartbeat sidecars of any type/combination. Either a single run
            # heartbeat or a single task heartbeat can be started.
            raise Exception("heartbeat already started")
        # create init message
        payload = {}
        if heartbeat_type == HeartbeatTypes.TASK:
            # create task heartbeat
            data = {
                "flow_id": flow_id,
                "run_number": run_id,
                "step_name": step_name,
                "task_id": task_id,
            }
            payload[HB_URL_KEY] = self.url_task_template.format(**data)
        elif heartbeat_type == HeartbeatTypes.RUN:
            # create run heartbeat
            data = {"flow_id": flow_id, "run_number": run_id}

            payload[HB_URL_KEY] = self.url_run_template.format(**data)
        else:
            raise Exception("invalid heartbeat type")
        payload["service_version"] = self.version()
        # start sidecar
        if self.version() is None or LooseVersion(self.version()) < LooseVersion(
            "2.0.4"
        ):
            # if old version of the service is running
            # then avoid running real heartbeat sidecar process
            self.sidecar = Sidecar("none")
        else:
            self.sidecar = Sidecar("heartbeat")
        self.sidecar.start()
        self.sidecar.send(Message(MessageTypes.BEST_EFFORT, payload))

    def start_run_heartbeat(self, flow_id, run_id):
        self._start_heartbeat(HeartbeatTypes.RUN, flow_id, run_id)

    def start_task_heartbeat(self, flow_id, run_id, step_name, task_id):
        self._start_heartbeat(HeartbeatTypes.TASK, flow_id, run_id, step_name, task_id)

    def _already_started(self):
        return self.sidecar is not None

    def stop_heartbeat(self):
        self.sidecar.terminate()

    def register_data_artifacts(
        self, run_id, step_name, task_id, attempt_id, artifacts
    ):
        url = ServiceMetadataProvider._obj_path(
            self._flow_name, run_id, step_name, task_id
        )
        url += "/artifact"
        data = self._artifacts_to_json(
            run_id, step_name, task_id, attempt_id, artifacts
        )
        self._request(self._monitor, url, "POST", data)

    def register_metadata(self, run_id, step_name, task_id, metadata):
        url = ServiceMetadataProvider._obj_path(
            self._flow_name, run_id, step_name, task_id
        )
        url += "/metadata"
        data = self._metadata_to_json(run_id, step_name, task_id, metadata)
        self._request(self._monitor, url, "POST", data)

    @classmethod
    def _mutate_user_tags_for_run(
        cls, flow_id, run_id, tags_to_add=None, tags_to_remove=None
    ):
        min_service_version_with_tag_mutation = "2.3.0"
        if cls._supports_tag_mutation is None:
            version = cls._version(None)
            cls._supports_tag_mutation = version is not None and LooseVersion(
                version
            ) >= LooseVersion(min_service_version_with_tag_mutation)
        if not cls._supports_tag_mutation:
            raise ServiceException(
                "Adding or removing tags on a run requires the Metaflow service to be "
                "at least version %s. Please upgrade your service."
                % (min_service_version_with_tag_mutation,)
            )

        url = ServiceMetadataProvider._obj_path(flow_id, run_id) + "/tag/mutate"
        tag_mutation_data = {
            # mutate_user_tags_for_run() should have already ensured that this is a list, so let's be tolerant here
            "tags_to_add": list(tags_to_add or []),
            "tags_to_remove": list(tags_to_remove or []),
        }
        tries = 1
        status_codes_seen = set()
        # try up to 10 times, with a gentle exponential backoff (1.4-1.6x)
        while True:
            resp, _ = cls._request(
                None, url, "PATCH", data=tag_mutation_data, return_raw_resp=True
            )
            status_codes_seen.add(resp.status_code)
            # happy path
            if resp.status_code < 300:
                return frozenset(resp.json()["tags"])
            # definitely NOT retriable
            if resp.status_code in (400, 422):
                raise MetaflowTaggingError("Metadata service says: %s" % (resp.text,))
            # if we get here, mutation failure is possibly retriable
            if tries >= 10:
                # if we ever received 409 on any of our attempts, report "conflicting updates" blurb to user
                if 409 in status_codes_seen:
                    raise MetaflowTaggingError(
                        "Tagging failed due to too many conflicting updates from other processes"
                    )
                # No 409's seen... raise a more generic error
                raise MetaflowTaggingError("Tagging failed after %d tries" % tries)
            time.sleep(0.3 * random.uniform(1.4, 1.6) ** tries)
            tries += 1

    @classmethod
    def _get_object_internal(
        cls, obj_type, obj_order, sub_type, sub_order, filters, attempt, *args
    ):
        if attempt is not None:
            if cls._supports_attempt_gets is None:
                version = cls._version(None)
                cls._supports_attempt_gets = version is not None and LooseVersion(
                    version
                ) >= LooseVersion("2.0.6")
            if not cls._supports_attempt_gets:
                raise ServiceException(
                    "Getting specific attempts of Tasks or Artifacts requires "
                    "the metaflow service to be at least version 2.0.6. Please "
                    "upgrade your service"
                )

        if sub_type == "self":
            if obj_type == "artifact":
                # Special case with the artifacts; we add the attempt
                url = ServiceMetadataProvider._obj_path(
                    *args[:obj_order], attempt=attempt
                )
            else:
                url = ServiceMetadataProvider._obj_path(*args[:obj_order])
            try:
                v, _ = cls._request(None, url, "GET")
                return MetadataProvider._apply_filter([v], filters)[0]
            except ServiceException as ex:
                if ex.http_code == 404:
                    return None
                raise

        # For the other types, we locate all the objects we need to find and return them
        if obj_type != "root":
            url = ServiceMetadataProvider._obj_path(*args[:obj_order])
        else:
            url = ""
        if sub_type == "metadata":
            url += "/metadata"
        elif sub_type == "artifact" and obj_type == "task" and attempt is not None:
            url += "/attempt/%s/artifacts" % attempt
        else:
            url += "/%ss" % sub_type
        try:
            v, _ = cls._request(None, url, "GET")
            return MetadataProvider._apply_filter(v, filters)
        except ServiceException as ex:
            if ex.http_code == 404:
                return None
            raise

    def _new_run(self, run_id=None, tags=None, sys_tags=None):
        # first ensure that the flow exists
        self._get_or_create("flow")
        run, did_create = self._get_or_create(
            "run", run_id, tags=tags, sys_tags=sys_tags
        )
        return str(run["run_number"]), did_create

    def _new_task(
        self, run_id, step_name, task_id=None, attempt=0, tags=None, sys_tags=None
    ):
        # first ensure that the step exists
        self._get_or_create("step", run_id, step_name)
        task, did_create = self._get_or_create(
            "task", run_id, step_name, task_id, tags=tags, sys_tags=sys_tags
        )
        if did_create:
            self._register_system_metadata(run_id, step_name, task["task_id"], attempt)
        return task["task_id"], did_create

    @staticmethod
    def _obj_path(
        flow_name,
        run_id=None,
        step_name=None,
        task_id=None,
        artifact_name=None,
        attempt=None,
    ):
        object_path = "/flows/%s" % flow_name
        if run_id is not None:
            object_path += "/runs/%s" % run_id
        if step_name is not None:
            object_path += "/steps/%s" % step_name
        if task_id is not None:
            object_path += "/tasks/%s" % task_id
        if artifact_name is not None:
            object_path += "/artifacts/%s" % artifact_name
        if attempt is not None:
            object_path += "/attempt/%s" % attempt
        return object_path

    @staticmethod
    def _create_path(obj_type, flow_name, run_id=None, step_name=None):
        create_path = "/flows/%s" % flow_name
        if obj_type == "flow":
            return create_path
        if obj_type == "run":
            return create_path + "/run"
        create_path += "/runs/%s/steps/%s" % (run_id, step_name)
        if obj_type == "step":
            return create_path + "/step"
        return create_path + "/task"

    def _get_or_create(
        self,
        obj_type,
        run_id=None,
        step_name=None,
        task_id=None,
        tags=None,
        sys_tags=None,
    ):

        if tags is None:
            tags = set()
        if sys_tags is None:
            sys_tags = set()

        def create_object():
            data = self._object_to_json(
                obj_type,
                run_id,
                step_name,
                task_id,
                self.sticky_tags.union(tags),
                self.sticky_sys_tags.union(sys_tags),
            )
            return self._request(
                self._monitor, create_path, "POST", data=data, retry_409_path=obj_path
            )

        always_create = False
        obj_path = self._obj_path(self._flow_name, run_id, step_name, task_id)
        create_path = self._create_path(obj_type, self._flow_name, run_id, step_name)
        if obj_type == "run" and run_id is None:
            always_create = True
        elif obj_type == "task" and task_id is None:
            always_create = True

        if always_create:
            return create_object()

        try:
            return self._request(self._monitor, obj_path, "GET")
        except ServiceException as ex:
            if ex.http_code == 404:
                return create_object()
            else:
                raise

    # TODO _request() needs a more deliberate refactor at some point, it looks quite overgrown.
    @classmethod
    def _request(
        cls,
        monitor,
        path,
        method,
        data=None,
        retry_409_path=None,
        return_raw_resp=False,
    ):
        if cls.INFO is None:
            raise MetaflowException(
                "Missing Metaflow Service URL. "
                "Specify with METAFLOW_SERVICE_URL environment variable"
            )
        supported_methods = ("GET", "PATCH", "POST")
        if method not in supported_methods:
            raise MetaflowException(
                "Only these methods are supported: %s, but got %s"
                % (supported_methods, method)
            )
        url = os.path.join(cls.INFO, path.lstrip("/"))
        for i in range(METADATA_SERVICE_NUM_RETRIES):
            try:
                if method == "GET":
                    if monitor:
                        with monitor.measure("metaflow.service_metadata.get"):
                            resp = requests.get(url, headers=METADATA_SERVICE_HEADERS)
                    else:
                        resp = requests.get(url, headers=METADATA_SERVICE_HEADERS)
                elif method == "POST":
                    if monitor:
                        with monitor.measure("metaflow.service_metadata.post"):
                            resp = requests.post(
                                url, headers=METADATA_SERVICE_HEADERS, json=data
                            )
                    else:
                        resp = requests.post(
                            url, headers=METADATA_SERVICE_HEADERS, json=data
                        )
                elif method == "PATCH":
                    if monitor:
                        with monitor.measure("metaflow.service_metadata.patch"):
                            resp = requests.patch(
                                url, headers=METADATA_SERVICE_HEADERS, json=data
                            )
                    else:
                        resp = requests.patch(
                            url, headers=METADATA_SERVICE_HEADERS, json=data
                        )
                else:
                    raise MetaflowInternalError("Unexpected HTTP method %s" % (method,))
            except MetaflowInternalError:
                raise
            except:  # noqa E722
                if monitor:
                    with monitor.count("metaflow.service_metadata.failed_request"):
                        if i == METADATA_SERVICE_NUM_RETRIES - 1:
                            raise
                else:
                    if i == METADATA_SERVICE_NUM_RETRIES - 1:
                        raise
                resp = None
            else:
                if return_raw_resp:
                    return resp, True
                if resp.status_code < 300:
                    return resp.json(), True
                elif resp.status_code == 409 and data is not None:
                    # a special case: the post fails due to a conflict
                    # this could occur when we missed a success response
                    # from the first POST request but the request
                    # actually went though, so a subsequent POST
                    # returns 409 (conflict) or we end up with a
                    # conflict while running on AWS Step Functions
                    # instead of retrying the post we retry with a get since
                    # the record is guaranteed to exist
                    if retry_409_path:
                        v, _ = cls._request(monitor, retry_409_path, "GET")
                        return v, False
                    else:
                        return None, False
                elif resp.status_code != 503:
                    raise ServiceException(
                        "Metadata request (%s) failed (code %s): %s"
                        % (path, resp.status_code, resp.text),
                        resp.status_code,
                        resp.text,
                    )
            time.sleep(2 ** i)

        if resp:
            raise ServiceException(
                "Metadata request (%s) failed (code %s): %s"
                % (path, resp.status_code, resp.text),
                resp.status_code,
                resp.text,
            )
        else:
            raise ServiceException("Metadata request (%s) failed" % path)

    @classmethod
    def _version(cls, monitor):
        if cls.INFO is None:
            raise MetaflowException(
                "Missing Metaflow Service URL. "
                "Specify with METAFLOW_SERVICE_URL environment variable"
            )
        path = "ping"
        url = os.path.join(cls.INFO, path)
        for i in range(METADATA_SERVICE_NUM_RETRIES):
            try:
                if monitor:
                    with monitor.measure("metaflow.service_metadata.get"):
                        resp = requests.get(url, headers=METADATA_SERVICE_HEADERS)
                else:
                    resp = requests.get(url, headers=METADATA_SERVICE_HEADERS)
            except:
                if monitor:
                    with monitor.count("metaflow.service_metadata.failed_request"):
                        if i == METADATA_SERVICE_NUM_RETRIES - 1:
                            raise
                else:
                    if i == METADATA_SERVICE_NUM_RETRIES - 1:
                        raise
                resp = None
            else:
                if resp.status_code < 300:
                    return resp.headers.get("METADATA_SERVICE_VERSION", None)
                elif resp.status_code != 503:
                    raise ServiceException(
                        "Metadata request (%s) failed"
                        " (code %s): %s" % (url, resp.status_code, resp.text),
                        resp.status_code,
                        resp.text,
                    )
            time.sleep(2 ** i)
        if resp:
            raise ServiceException(
                "Metadata request (%s) failed (code %s): %s"
                % (url, resp.status_code, resp.text),
                resp.status_code,
                resp.text,
            )
        else:
            raise ServiceException("Metadata request (%s) failed" % url)
Example #8
class BatchDecorator(StepDecorator):
    """
    Step decorator to specify that this step should execute on AWS Batch.

    This decorator indicates that your step should execute on AWS Batch. Note
    that you can apply this decorator automatically to all steps using the
    ```--with batch``` argument when calling run/resume. Step level decorators
    within the code are overrides and will force a step to execute on AWS Batch
    regardless of the ```--with``` specification.

    To use, annotate your step as follows:
    ```
    @batch
    @step
    def my_step(self):
        ...
    ```
    Parameters
    ----------
    cpu : int
        Number of CPUs required for this step. Defaults to 1. If @resources is
        also present, the maximum value from all decorators is used
    gpu : int
        Number of GPUs required for this step. Defaults to 0. If @resources is
        also present, the maximum value from all decorators is used
    memory : int
        Memory size (in MB) required for this step. Defaults to 4096. If
        @resources is also present, the maximum value from all decorators is
        used
    image : string
        Docker image to use when launching on AWS Batch. If not specified, a
        default docker image mapping to the current version of Python is used
    queue : string
        AWS Batch Job Queue to submit the job to. Defaults to the one
        specified by the environment variable METAFLOW_BATCH_JOB_QUEUE
    iam_role : string
        AWS IAM role that AWS Batch container uses to access AWS cloud resources
        (Amazon S3, Amazon DynamoDb, etc). Defaults to the one specified by the
        environment variable METAFLOW_ECS_S3_ACCESS_IAM_ROLE
    execution_role : string
        AWS IAM role that AWS Batch can use to trigger AWS Fargate tasks.
        Defaults to the one determined by the environment variable
        METAFLOW_ECS_FARGATE_EXECUTION_ROLE https://docs.aws.amazon.com/batch/latest/userguide/execution-IAM-role.html
    shared_memory : int
        The value for the size (in MiB) of the /dev/shm volume for this step.
        This parameter maps to the --shm-size option to docker run.
    max_swap : int
        The total amount of swap memory (in MiB) a container can use for this
        step. This parameter is translated to the --memory-swap option to
        docker run where the value is the sum of the container memory plus the
        max_swap value.
    swappiness : int
        This allows you to tune memory swappiness behavior for this step.
        A swappiness value of 0 causes swapping not to happen unless absolutely
        necessary. A swappiness value of 100 causes pages to be swapped very
        aggressively. Accepted values are whole numbers between 0 and 100.
    """

    name = "batch"
    defaults = {
        "cpu": None,
        "gpu": None,
        "memory": None,
        "image": None,
        "queue": BATCH_JOB_QUEUE,
        "iam_role": ECS_S3_ACCESS_IAM_ROLE,
        "execution_role": ECS_FARGATE_EXECUTION_ROLE,
        "shared_memory": None,
        "max_swap": None,
        "swappiness": None,
        "host_volumes": None,
    }
    resource_defaults = {
        "cpu": "1",
        "gpu": "0",
        "memory": "4096",
    }
    package_url = None
    package_sha = None
    run_time_limit = None

    def __init__(self, attributes=None, statically_defined=False):
        super(BatchDecorator, self).__init__(attributes, statically_defined)

        # If no docker image is explicitly specified, impute a default image.
        if not self.attributes["image"]:
            # If metaflow-config specifies a docker image, just use that.
            if BATCH_CONTAINER_IMAGE:
                self.attributes["image"] = BATCH_CONTAINER_IMAGE
            # If metaflow-config doesn't specify a docker image, assign a
            # default docker image.
            else:
                # Metaflow-R has its own default docker image (rocker family)
                if R.use_r():
                    self.attributes["image"] = R.container_image()
                # Default to vanilla Python image corresponding to major.minor
                # version of the Python interpreter launching the flow.
                else:
                    self.attributes["image"] = "python:%s.%s" % (
                        platform.python_version_tuple()[0],
                        platform.python_version_tuple()[1],
                    )
        # Assign docker registry URL for the image.
        if not get_docker_registry(self.attributes["image"]):
            if BATCH_CONTAINER_REGISTRY:
                self.attributes["image"] = "%s/%s" % (
                    BATCH_CONTAINER_REGISTRY.rstrip("/"),
                    self.attributes["image"],
                )

    # Refer https://github.com/Netflix/metaflow/blob/master/docs/lifecycle.png
    # to understand where these functions are invoked in the lifecycle of a
    # Metaflow flow.
    def step_init(self, flow, graph, step, decos, environment, flow_datastore,
                  logger):
        if flow_datastore.TYPE != "s3":
            raise BatchException(
                "The *@batch* decorator requires --datastore=s3.")

        # Set internal state.
        self.logger = logger
        self.environment = environment
        self.step = step
        self.flow_datastore = flow_datastore

        self.attributes.update(
            compute_resource_attributes(decos, self, self.resource_defaults))

        # Set run time limit for the AWS Batch job.
        self.run_time_limit = get_run_time_limit_for_task(decos)
        if self.run_time_limit < 60:
            raise BatchException(
                "The timeout for step *{step}* should be at "
                "least 60 seconds for execution on AWS Batch.".format(
                    step=step))

    def runtime_init(self, flow, graph, package, run_id):
        # Set some more internal state.
        self.flow = flow
        self.graph = graph
        self.package = package
        self.run_id = run_id

    def runtime_task_created(self, task_datastore, task_id, split_index,
                             input_paths, is_cloned, ubf_context):
        if not is_cloned:
            self._save_package_once(self.flow_datastore, self.package)

    def runtime_step_cli(self, cli_args, retry_count, max_user_code_retries,
                         ubf_context):
        if retry_count <= max_user_code_retries:
            # We only execute on AWS Batch while user-code retries remain. After
            # all attempts to run the user code have failed, possible fallback
            # code (e.g. from @catch) can execute locally instead.
            cli_args.commands = ["batch", "step"]
            cli_args.command_args.append(self.package_sha)
            cli_args.command_args.append(self.package_url)
            cli_args.command_options.update(self.attributes)
            cli_args.command_options["run-time-limit"] = self.run_time_limit
            if not R.use_r():
                cli_args.entrypoint[0] = sys.executable

    def task_pre_step(
        self,
        step_name,
        task_datastore,
        metadata,
        run_id,
        task_id,
        flow,
        graph,
        retry_count,
        max_retries,
        ubf_context,
        inputs,
    ):
        self.metadata = metadata
        self.task_datastore = task_datastore

        # task_pre_step may run locally if fallback is activated for @catch
        # decorator. In that scenario, we skip collecting AWS Batch execution
        # metadata. A rudimentary way to detect non-local execution is to
        # check for the existence of AWS_BATCH_JOB_ID environment variable.

        if "AWS_BATCH_JOB_ID" in os.environ:
            meta = {}
            meta["aws-batch-job-id"] = os.environ["AWS_BATCH_JOB_ID"]
            meta["aws-batch-job-attempt"] = os.environ["AWS_BATCH_JOB_ATTEMPT"]
            meta["aws-batch-ce-name"] = os.environ["AWS_BATCH_CE_NAME"]
            meta["aws-batch-jq-name"] = os.environ["AWS_BATCH_JQ_NAME"]
            meta["aws-batch-execution-env"] = os.environ["AWS_EXECUTION_ENV"]

            # Capture AWS Logs metadata. This is best effort only, since
            # only V4 of the ECS container metadata URI exposes this
            # information, and it is quite likely that not all consumers of
            # Metaflow are running a container agent compatible with
            # version V4.
            # https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-metadata-endpoint.html
            try:
                logs_meta = (
                    requests.get(url=os.environ["ECS_CONTAINER_METADATA_URI_V4"])
                    .json()
                    .get("LogOptions", {})
                )
                meta["aws-batch-awslogs-group"] = logs_meta.get(
                    "awslogs-group")
                meta["aws-batch-awslogs-region"] = logs_meta.get(
                    "awslogs-region")
                meta["aws-batch-awslogs-stream"] = logs_meta.get(
                    "awslogs-stream")
            except:
                pass

            entries = [
                MetaDatum(
                    field=k,
                    value=v,
                    type=k,
                    tags=["attempt_id:{0}".format(retry_count)],
                ) for k, v in meta.items()
            ]
            # Register book-keeping metadata for debugging.
            metadata.register_metadata(run_id, step_name, task_id, entries)

            self._save_logs_sidecar = Sidecar("save_logs_periodically")
            self._save_logs_sidecar.start()

        num_parallel = int(os.environ.get("AWS_BATCH_JOB_NUM_NODES", 0))
        if num_parallel >= 1 and ubf_context == UBF_CONTROL:
            # UBF handling for multinode case
            control_task_id = current.task_id
            top_task_id = control_task_id.replace("control-", "")  # strip the "control-" prefix
            mapper_task_ids = [control_task_id] + [
                "%s-node-%d" % (top_task_id, node_idx)
                for node_idx in range(1, num_parallel)
            ]
            flow._control_mapper_tasks = [
                "%s/%s/%s" % (run_id, step_name, mapper_task_id)
                for mapper_task_id in mapper_task_ids
            ]
            flow._control_task_is_mapper_zero = True

        if num_parallel >= 1:
            _setup_multinode_environment()

    def task_finished(self, step_name, flow, graph, is_task_ok, retry_count,
                      max_retries):

        # task_finished may run locally if fallback is activated for @catch
        # decorator.
        if "AWS_BATCH_JOB_ID" in os.environ:
            # If `local` metadata is configured, we would need to copy task
            # execution metadata from the AWS Batch container to user's
            # local file system after the user code has finished execution.
            # This happens via datastore as a communication bridge.
            if self.metadata.TYPE == "local":
                # Note that the datastore is *always* Amazon S3 (see
                # runtime_task_created function).
                sync_local_metadata_to_datastore(DATASTORE_LOCAL_DIR,
                                                 self.task_datastore)

        try:
            self._save_logs_sidecar.terminate()
        except:
            # Best effort kill
            pass

        if is_task_ok and len(getattr(flow, "_control_mapper_tasks", [])) > 1:
            self._wait_for_mapper_tasks(flow, step_name)

    def _wait_for_mapper_tasks(self, flow, step_name):
        """
        When launching a multinode task with UBF, we need to wait for the
        secondary tasks to finish cleanly and produce their output before
        exiting the main task. Otherwise, the main task finishing would cause
        secondary nodes to terminate immediately, and possibly prematurely.
        """
        from metaflow import Step  # avoid circular dependency

        TIMEOUT = 600
        last_completion_timeout = time.time() + TIMEOUT
        print("Waiting for batch secondary tasks to finish")
        while last_completion_timeout > time.time():
            time.sleep(2)
            try:
                step_path = "%s/%s/%s" % (flow.name, current.run_id, step_name)
                tasks = [task for task in Step(step_path)]
                if len(tasks) == len(flow._control_mapper_tasks):
                    if all(task.finished_at is not None for task in
                           tasks):  # for some reason task.finished fails
                        return True
                else:
                    print(
                        "Waiting for all parallel tasks to finish. Finished: {}/{}"
                        .format(
                            len(tasks),
                            len(flow._control_mapper_tasks),
                        ))
            except Exception as e:
                pass
        raise Exception(
            "Batch secondary workers did not finish in %s seconds" % TIMEOUT)

    @classmethod
    def _save_package_once(cls, flow_datastore, package):
        if cls.package_url is None:
            cls.package_url, cls.package_sha = flow_datastore.save_data(
                [package.blob], len_hint=1)[0]
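
A hedged sketch of combining @resources with the decorator above, illustrating the "maximum value from all decorators" rule described in the docstring; the imports, queue name, and resource values are assumptions:

from metaflow import FlowSpec, step, batch, resources

class TrainFlow(FlowSpec):

    @resources(memory=16000, cpu=2)
    @batch(memory=8000, cpu=4, queue="my-batch-queue")  # memory resolves to 16000, cpu to 4
    @step
    def start(self):
        self.next(self.end)

    @step
    def end(self):
        pass

if __name__ == "__main__":
    TrainFlow()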
Example #9
    def task_pre_step(
        self,
        step_name,
        task_datastore,
        metadata,
        run_id,
        task_id,
        flow,
        graph,
        retry_count,
        max_retries,
        ubf_context,
        inputs,
    ):
        self.metadata = metadata
        self.task_datastore = task_datastore

        # task_pre_step may run locally if fallback is activated for @catch
        # decorator. In that scenario, we skip collecting AWS Batch execution
        # metadata. A rudimentary way to detect non-local execution is to
        # check for the existence of AWS_BATCH_JOB_ID environment variable.

        if "AWS_BATCH_JOB_ID" in os.environ:
            meta = {}
            meta["aws-batch-job-id"] = os.environ["AWS_BATCH_JOB_ID"]
            meta["aws-batch-job-attempt"] = os.environ["AWS_BATCH_JOB_ATTEMPT"]
            meta["aws-batch-ce-name"] = os.environ["AWS_BATCH_CE_NAME"]
            meta["aws-batch-jq-name"] = os.environ["AWS_BATCH_JQ_NAME"]
            meta["aws-batch-execution-env"] = os.environ["AWS_EXECUTION_ENV"]

            # Capture AWS Logs metadata. This is best effort only, since
            # only V4 of the ECS container metadata URI exposes this
            # information, and it is quite likely that not all consumers of
            # Metaflow are running a container agent compatible with
            # version V4.
            # https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-metadata-endpoint.html
            try:
                logs_meta = (
                    requests.get(url=os.environ["ECS_CONTAINER_METADATA_URI_V4"])
                    .json()
                    .get("LogOptions", {})
                )
                meta["aws-batch-awslogs-group"] = logs_meta.get(
                    "awslogs-group")
                meta["aws-batch-awslogs-region"] = logs_meta.get(
                    "awslogs-region")
                meta["aws-batch-awslogs-stream"] = logs_meta.get(
                    "awslogs-stream")
            except:
                pass

            entries = [
                MetaDatum(
                    field=k,
                    value=v,
                    type=k,
                    tags=["attempt_id:{0}".format(retry_count)],
                ) for k, v in meta.items()
            ]
            # Register book-keeping metadata for debugging.
            metadata.register_metadata(run_id, step_name, task_id, entries)

            self._save_logs_sidecar = Sidecar("save_logs_periodically")
            self._save_logs_sidecar.start()

        num_parallel = int(os.environ.get("AWS_BATCH_JOB_NUM_NODES", 0))
        if num_parallel >= 1 and ubf_context == UBF_CONTROL:
            # UBF handling for multinode case
            control_task_id = current.task_id
            top_task_id = control_task_id.replace("control-", "")  # strip the "control-" prefix
            mapper_task_ids = [control_task_id] + [
                "%s-node-%d" % (top_task_id, node_idx)
                for node_idx in range(1, num_parallel)
            ]
            flow._control_mapper_tasks = [
                "%s/%s/%s" % (run_id, step_name, mapper_task_id)
                for mapper_task_id in mapper_task_ids
            ]
            flow._control_task_is_mapper_zero = True

        if num_parallel >= 1:
            _setup_multinode_environment()
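
To make the control/mapper task naming above concrete, a small illustration with made-up identifiers:

# Illustration only; the identifiers are made up to match the naming scheme above.
control_task_id = "control-123-0"
top_task_id = control_task_id.replace("control-", "")          # "123-0"
num_parallel = 3
mapper_task_ids = [control_task_id] + [
    "%s-node-%d" % (top_task_id, node_idx) for node_idx in range(1, num_parallel)
]
print(mapper_task_ids)  # ['control-123-0', '123-0-node-1', '123-0-node-2']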