class NullMonitor(object):
    TYPE = "nullSidecarMonitor"

    def __init__(self, *args, **kwargs):
        # Currently passed flow and env as kwargs
        self._sidecar = Sidecar(self.TYPE)

    def start(self):
        return self._sidecar.start()

    def terminate(self):
        return self._sidecar.terminate()

    def send(self, msg):
        # Arbitrary message sending. Useful if you want to override some different
        # types of messages.
        self._sidecar.send(msg)

    @contextmanager
    def count(self, name):
        if self._sidecar.is_active:
            counter = Counter(name)
            counter.increment()
            payload = {"counter": counter.serialize()}
            msg = Message(MessageTypes.BEST_EFFORT, payload)
            yield
            self._sidecar.send(msg)
        else:
            yield

    @contextmanager
    def measure(self, name):
        if self._sidecar.is_active:
            timer = Timer(name + "_timer")
            counter = Counter(name + "_counter")
            timer.start()
            counter.increment()
            yield
            timer.end()
            payload = {
                "counter": counter.serialize(),
                "timer": timer.serialize(),
            }
            msg = Message(MessageTypes.BEST_EFFORT, payload)
            self._sidecar.send(msg)
        else:
            yield

    def gauge(self, gauge):
        if self._sidecar.is_active:
            payload = {"gauge": gauge.serialize()}
            msg = Message(MessageTypes.BEST_EFFORT, payload)
            self._sidecar.send(msg)

    @classmethod
    def get_worker(cls):
        return None
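# A minimal usage sketch (not part of the original source): both context
# managers above are no-ops when the sidecar is inactive, so instrumentation
# is safe to leave in place. `do_work` is a hypothetical stand-in for the
# code being monitored.
def do_work():
    pass

monitor = NullMonitor()
monitor.start()
with monitor.count("my_step.invocations"):
    do_work()  # counter is incremented on entry; the message is sent on exit
with monitor.measure("my_step.duration"):
    do_work()  # timer and counter are serialized and sent when the block exits
monitor.terminate()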
class NullEventLogger(object):
    TYPE = "nullSidecarLogger"

    def __init__(self, *args, **kwargs):
        # Currently passed flow and env in kwargs
        self._sidecar = Sidecar(self.TYPE)

    def start(self):
        return self._sidecar.start()

    def terminate(self):
        return self._sidecar.terminate()

    def send(self, msg):
        # Arbitrary message sending. Useful if you want to override some different
        # types of messages.
        self._sidecar.send(msg)

    def log(self, payload):
        if self._sidecar.is_active:
            msg = Message(MessageTypes.BEST_EFFORT, payload)
            self._sidecar.send(msg)

    @classmethod
    def get_worker(cls):
        return None
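# A minimal sketch (not part of the original source): the event-logger
# interface mirrors the monitor above. log() wraps an arbitrary payload in a
# best-effort Message when the sidecar is active, and is a silent no-op
# otherwise. The payload shown is hypothetical.
logger = NullEventLogger()
logger.start()
logger.log({"event": "task_started", "step": "start"})
logger.terminate()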
class KubernetesDecorator(StepDecorator):
    """
    Step decorator to specify that this step should execute on Kubernetes.

    This decorator indicates that your step should execute on Kubernetes. Note
    that you can apply this decorator automatically to all steps using the
    ```--with kubernetes``` argument when calling run/resume. Step level
    decorators within the code are overrides and will force a step to execute
    on Kubernetes regardless of the ```--with``` specification.

    To use, annotate your step as follows:
    ```
    @kubernetes
    @step
    def my_step(self):
        ...
    ```

    Parameters
    ----------
    cpu : int
        Number of CPUs required for this step. Defaults to 1. If @resources is
        also present, the maximum value from all decorators is used.
    memory : int
        Memory size (in MB) required for this step. Defaults to 4096. If
        @resources is also present, the maximum value from all decorators is
        used.
    disk : int
        Disk size (in MB) required for this step. Defaults to 10GB. If
        @resources is also present, the maximum value from all decorators is
        used.
    image : string
        Docker image to use when launching on Kubernetes. If not specified, a
        default docker image mapping to the current version of Python is used.
    """

    name = "kubernetes"
    defaults = {
        "cpu": "1",
        "memory": "4096",
        "disk": "10240",
        "image": None,
        "service_account": None,
        "secrets": None,  # e.g., mysecret
        "node_selector": None,  # e.g., kubernetes.io/os=linux
        "namespace": None,
        "gpu": None,  # value of 0 implies that the scheduled node should not have GPUs
        "gpu_vendor": None,
    }
    package_url = None
    package_sha = None
    run_time_limit = None

    def __init__(self, attributes=None, statically_defined=False):
        super(KubernetesDecorator, self).__init__(attributes, statically_defined)

        if not self.attributes["namespace"]:
            self.attributes["namespace"] = KUBERNETES_NAMESPACE
        if not self.attributes["service_account"]:
            self.attributes["service_account"] = KUBERNETES_SERVICE_ACCOUNT
        if not self.attributes["gpu_vendor"]:
            self.attributes["gpu_vendor"] = KUBERNETES_GPU_VENDOR

        # TODO: Handle node_selector in a better manner. Currently it is
        # special-cased in kubernetes_client.py.

        # If no docker image is explicitly specified, impute a default image.
        if not self.attributes["image"]:
            # If metaflow-config specifies a docker image, just use that.
            if KUBERNETES_CONTAINER_IMAGE:
                self.attributes["image"] = KUBERNETES_CONTAINER_IMAGE
            # If metaflow-config doesn't specify a docker image, assign a
            # default docker image.
            else:
                # Default to vanilla Python image corresponding to major.minor
                # version of the Python interpreter launching the flow.
                self.attributes["image"] = "python:%s.%s" % (
                    platform.python_version_tuple()[0],
                    platform.python_version_tuple()[1],
                )
        # Assign docker registry URL for the image.
        if not get_docker_registry(self.attributes["image"]):
            if KUBERNETES_CONTAINER_REGISTRY:
                self.attributes["image"] = "%s/%s" % (
                    KUBERNETES_CONTAINER_REGISTRY.rstrip("/"),
                    self.attributes["image"],
                )

    # Refer https://github.com/Netflix/metaflow/blob/master/docs/lifecycle.png
    def step_init(self, flow, graph, step, decos, environment, flow_datastore, logger):
        # Executing Kubernetes jobs requires a non-local datastore.
        if flow_datastore.TYPE != "s3":
            raise KubernetesException(
                "The *@kubernetes* decorator requires --datastore=s3 at the moment."
            )

        # Set internal state.
        self.logger = logger
        self.environment = environment
        self.step = step
        self.flow_datastore = flow_datastore

        if any([deco.name == "batch" for deco in decos]):
            raise MetaflowException(
                "Step *{step}* is marked for execution both on AWS Batch and "
                "Kubernetes. Please use one or the other.".format(step=step)
            )

        for deco in decos:
            if getattr(deco, "IS_PARALLEL", False):
                raise KubernetesException(
                    "@kubernetes does not support parallel execution currently."
                )

        # Set run time limit for the Kubernetes job.
        self.run_time_limit = get_run_time_limit_for_task(decos)
        if self.run_time_limit < 60:
            raise KubernetesException(
                "The timeout for step *{step}* should be at least 60 seconds for "
                "execution on Kubernetes.".format(step=step)
            )

        for deco in decos:
            if isinstance(deco, ResourcesDecorator):
                for k, v in deco.attributes.items():
                    # TODO: Special case GPUs when they are introduced in @resources.
                    if k in self.attributes:
                        if self.defaults[k] is None:
                            # skip if expected value isn't an int/float
                            continue
                        # We use the larger of @resources and @batch attributes
                        # TODO: Fix https://github.com/Netflix/metaflow/issues/467
                        my_val = self.attributes.get(k)
                        if not (my_val is None and v is None):
                            self.attributes[k] = str(
                                max(float(my_val or 0), float(v or 0))
                            )

        # Check GPU vendor.
        if self.attributes["gpu_vendor"].lower() not in ("amd", "nvidia"):
            raise KubernetesException(
                "GPU vendor *{}* for step *{step}* is not currently supported.".format(
                    self.attributes["gpu_vendor"], step=step
                )
            )

        # CPU, Disk, and Memory values should be greater than 0.
        for attr in ["cpu", "disk", "memory"]:
            if not (
                isinstance(self.attributes[attr], (int, unicode, basestring, float))
                and float(self.attributes[attr]) > 0
            ):
                raise KubernetesException(
                    "Invalid {} value *{}* for step *{step}*; "
                    "it should be greater than 0".format(
                        attr, self.attributes[attr], step=step
                    )
                )

        if self.attributes["gpu"] is not None and not (
            isinstance(self.attributes["gpu"], (int, unicode, basestring))
            and float(self.attributes["gpu"]).is_integer()
        ):
            raise KubernetesException(
                "Invalid GPU value *{}* for step *{step}*; "
                "it should be an integer".format(self.attributes["gpu"], step=step)
            )

    def package_init(self, flow, step_name, environment):
        try:
            # Kubernetes is a soft dependency.
            from kubernetes import client, config
        except (NameError, ImportError):
            raise KubernetesException(
                "Could not import module 'kubernetes'.\n\nInstall Kubernetes "
                "Python package (https://pypi.org/project/kubernetes/) first.\n"
                "You can install the module by executing - "
                "%s -m pip install kubernetes\n"
                "or equivalent through your favorite Python package manager."
                % sys.executable
            )

    def runtime_init(self, flow, graph, package, run_id):
        # Set some more internal state.
        self.flow = flow
        self.graph = graph
        self.package = package
        self.run_id = run_id

    def runtime_task_created(
        self, task_datastore, task_id, split_index, input_paths, is_cloned, ubf_context
    ):
        # To execute the Kubernetes job, the job container needs to have
        # access to the code package. We store the package in the datastore
        # which the pod is able to download as part of its entrypoint.
        if not is_cloned:
            self._save_package_once(self.flow_datastore, self.package)

    def runtime_step_cli(
        self, cli_args, retry_count, max_user_code_retries, ubf_context
    ):
        if retry_count <= max_user_code_retries:
            # After all attempts to run the user code have failed, we don't need
            # to execute on Kubernetes anymore. We can execute possible fallback
            # code locally.
            cli_args.commands = ["kubernetes", "step"]
            cli_args.command_args.append(self.package_sha)
            cli_args.command_args.append(self.package_url)

            # --namespace is used to specify the Metaflow namespace (a different
            # concept from the Kubernetes namespace).
            for k, v in self.attributes.items():
                if k == "namespace":
                    cli_args.command_options["k8s_namespace"] = v
                else:
                    cli_args.command_options[k] = v
            cli_args.command_options["run-time-limit"] = self.run_time_limit
            cli_args.entrypoint[0] = sys.executable

    def task_pre_step(
        self,
        step_name,
        task_datastore,
        metadata,
        run_id,
        task_id,
        flow,
        graph,
        retry_count,
        max_retries,
        ubf_context,
        inputs,
    ):
        self.metadata = metadata
        self.task_datastore = task_datastore

        # task_pre_step may run locally if fallback is activated for the @catch
        # decorator. In that scenario, we skip collecting Kubernetes execution
        # metadata. A rudimentary way to detect non-local execution is to
        # check for the existence of the METAFLOW_KUBERNETES_WORKLOAD
        # environment variable.
        if "METAFLOW_KUBERNETES_WORKLOAD" in os.environ:
            meta = {}
            meta["kubernetes-pod-name"] = os.environ["METAFLOW_KUBERNETES_POD_NAME"]
            meta["kubernetes-pod-namespace"] = os.environ[
                "METAFLOW_KUBERNETES_POD_NAMESPACE"
            ]
            meta["kubernetes-pod-id"] = os.environ["METAFLOW_KUBERNETES_POD_ID"]
            meta["kubernetes-pod-service-account-name"] = os.environ[
                "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME"
            ]
            # Unfortunately, there doesn't seem to be any straightforward way
            # right now to attach the Batch/v1 name. While we can rely on a
            # hacky approach, given we know that the pod name is simply the
            # Batch/v1 name plus a unique hyphen-delimited suffix, this
            # approach will fail if the Batch/v1 name is closer to 63 chars,
            # where the pod name will truncate the Batch/v1 name.
            # if "ARGO_WORKFLOW_NAME" not in os.environ:
            #     meta["kubernetes-job-name"] = os.environ[
            #         "METAFLOW_KUBERNETES_POD_NAME"
            #     ].rpartition("-")[0]

            entries = [
                MetaDatum(field=k, value=v, type=k, tags=[]) for k, v in meta.items()
            ]
            # Register book-keeping metadata for debugging.
            metadata.register_metadata(run_id, step_name, task_id, entries)

            # Start MFLog sidecar to collect task logs.
            self._save_logs_sidecar = Sidecar("save_logs_periodically")
            self._save_logs_sidecar.start()

    def task_finished(
        self, step_name, flow, graph, is_task_ok, retry_count, max_retries
    ):
        # task_finished may run locally if fallback is activated for the @catch
        # decorator.
        if "METAFLOW_KUBERNETES_WORKLOAD" in os.environ:
            # If `local` metadata is configured, we need to copy task
            # execution metadata from the Kubernetes container to the user's
            # local file system after the user code has finished execution.
            # This happens via the datastore as a communication bridge.

            # TODO: There is no guarantee that task_pre_step executes before
            # task_finished is invoked. That will result in an AttributeError:
            # 'KubernetesDecorator' object has no attribute 'metadata' error.
            if self.metadata.TYPE == "local":
                # Note that the datastore is *always* Amazon S3 (see
                # runtime_task_created function).
                sync_local_metadata_to_datastore(
                    DATASTORE_LOCAL_DIR, self.task_datastore
                )

        try:
            self._save_logs_sidecar.terminate()
        except:
            # Best-effort kill
            pass

    @classmethod
    def _save_package_once(cls, flow_datastore, package):
        if cls.package_url is None:
            cls.package_url, cls.package_sha = flow_datastore.save_data(
                [package.blob], len_hint=1
            )[0]
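# A hedged usage sketch (not part of the original source) of the decorator
# above: resource attributes can be passed explicitly, or the decorator can
# be applied to every step with `--with kubernetes` on the command line.
# The flow and step names are hypothetical.
from metaflow import FlowSpec, kubernetes, step


class HelloKubernetesFlow(FlowSpec):
    @kubernetes(cpu=2, memory=8192, disk=20480)
    @step
    def start(self):
        print("Running on Kubernetes")
        self.next(self.end)

    @step
    def end(self):
        pass


if __name__ == "__main__":
    HelloKubernetesFlow()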
class ServiceMetadataProvider(MetadataProvider):
    TYPE = "service"

    _supports_attempt_gets = None
    _supports_tag_mutation = None

    def __init__(self, environment, flow, event_logger, monitor):
        super(ServiceMetadataProvider, self).__init__(
            environment, flow, event_logger, monitor
        )
        self.url_task_template = os.path.join(
            METADATA_SERVICE_URL,
            "flows/{flow_id}/runs/{run_number}/steps/{step_name}/tasks/{task_id}/heartbeat",
        )
        self.url_run_template = os.path.join(
            METADATA_SERVICE_URL, "flows/{flow_id}/runs/{run_number}/heartbeat"
        )
        self.sidecar = None

    @classmethod
    def compute_info(cls, val):
        v = val.rstrip("/")
        try:
            resp = requests.get(
                os.path.join(v, "ping"), headers=METADATA_SERVICE_HEADERS
            )
            resp.raise_for_status()
        except:  # noqa E722
            raise ValueError("Metaflow service [%s] unreachable." % v)
        return v

    @classmethod
    def default_info(cls):
        return METADATA_SERVICE_URL

    def version(self):
        return self._version(self._monitor)

    def new_run_id(self, tags=None, sys_tags=None):
        v, _ = self._new_run(tags=tags, sys_tags=sys_tags)
        return v

    def register_run_id(self, run_id, tags=None, sys_tags=None):
        try:
            # don't try to register an integer ID which was obtained
            # from the metadata service in the first place
            int(run_id)
            return False
        except ValueError:
            _, did_create = self._new_run(run_id, tags=tags, sys_tags=sys_tags)
            return did_create

    def new_task_id(self, run_id, step_name, tags=None, sys_tags=None):
        v, _ = self._new_task(run_id, step_name, tags=tags, sys_tags=sys_tags)
        return v

    def register_task_id(
        self, run_id, step_name, task_id, attempt=0, tags=None, sys_tags=None
    ):
        try:
            # don't try to register an integer ID which was obtained
            # from the metadata service in the first place
            int(task_id)
        except ValueError:
            _, did_create = self._new_task(
                run_id,
                step_name,
                task_id=task_id,
                attempt=attempt,
                tags=tags,
                sys_tags=sys_tags,
            )
            return did_create
        else:
            self._register_system_metadata(run_id, step_name, task_id, attempt)
            return False

    def _start_heartbeat(
        self, heartbeat_type, flow_id, run_id, step_name=None, task_id=None
    ):
        if self._already_started():
            # A single ServiceMetadataProvider instance cannot start
            # multiple heartbeat sidecars of any type/combination. Either a
            # single run heartbeat or a single task heartbeat can be started.
            raise Exception("heartbeat already started")
        # create init message
        payload = {}
        if heartbeat_type == HeartbeatTypes.TASK:
            # create task heartbeat
            data = {
                "flow_id": flow_id,
                "run_number": run_id,
                "step_name": step_name,
                "task_id": task_id,
            }
            payload[HB_URL_KEY] = self.url_task_template.format(**data)
        elif heartbeat_type == HeartbeatTypes.RUN:
            # create run heartbeat
            data = {"flow_id": flow_id, "run_number": run_id}
            payload[HB_URL_KEY] = self.url_run_template.format(**data)
        else:
            raise Exception("invalid heartbeat type")
        payload["service_version"] = self.version()
        # start sidecar
        if self.version() is None or LooseVersion(self.version()) < LooseVersion(
            "2.0.4"
        ):
            # if an old version of the service is running,
            # avoid starting a real heartbeat sidecar process
            self.sidecar = Sidecar("none")
        else:
            self.sidecar = Sidecar("heartbeat")
        self.sidecar.start()
        self.sidecar.send(Message(MessageTypes.BEST_EFFORT, payload))

    def start_run_heartbeat(self, flow_id, run_id):
        self._start_heartbeat(HeartbeatTypes.RUN, flow_id, run_id)

    def start_task_heartbeat(self, flow_id, run_id, step_name, task_id):
        self._start_heartbeat(HeartbeatTypes.TASK, flow_id, run_id, step_name, task_id)

    def _already_started(self):
        return self.sidecar is not None

    def stop_heartbeat(self):
        self.sidecar.terminate()

    def register_data_artifacts(
        self, run_id, step_name, task_id, attempt_id, artifacts
    ):
        url = ServiceMetadataProvider._obj_path(
            self._flow_name, run_id, step_name, task_id
        )
        url += "/artifact"
        data = self._artifacts_to_json(
            run_id, step_name, task_id, attempt_id, artifacts
        )
        self._request(self._monitor, url, "POST", data)

    def register_metadata(self, run_id, step_name, task_id, metadata):
        url = ServiceMetadataProvider._obj_path(
            self._flow_name, run_id, step_name, task_id
        )
        url += "/metadata"
        data = self._metadata_to_json(run_id, step_name, task_id, metadata)
        self._request(self._monitor, url, "POST", data)

    @classmethod
    def _mutate_user_tags_for_run(
        cls, flow_id, run_id, tags_to_add=None, tags_to_remove=None
    ):
        min_service_version_with_tag_mutation = "2.3.0"
        if cls._supports_tag_mutation is None:
            version = cls._version(None)
            cls._supports_tag_mutation = version is not None and LooseVersion(
                version
            ) >= LooseVersion(min_service_version_with_tag_mutation)
        if not cls._supports_tag_mutation:
            raise ServiceException(
                "Adding or removing tags on a run requires the Metaflow service to be "
                "at least version %s. Please upgrade your service."
                % (min_service_version_with_tag_mutation,)
            )

        url = ServiceMetadataProvider._obj_path(flow_id, run_id) + "/tag/mutate"
        tag_mutation_data = {
            # mutate_user_tags_for_run() should have already ensured that this
            # is a list, so let's be tolerant here
            "tags_to_add": list(tags_to_add or []),
            "tags_to_remove": list(tags_to_remove or []),
        }
        tries = 1
        status_codes_seen = set()
        # try up to 10 times, with a gentle exponential backoff (1.4-1.6x)
        while True:
            resp, _ = cls._request(
                None, url, "PATCH", data=tag_mutation_data, return_raw_resp=True
            )
            status_codes_seen.add(resp.status_code)
            # happy path
            if resp.status_code < 300:
                return frozenset(resp.json()["tags"])
            # definitely NOT retriable
            if resp.status_code in (400, 422):
                raise MetaflowTaggingError("Metadata service says: %s" % (resp.text,))
            # if we get here, mutation failure is possibly retriable
            if tries >= 10:
                # if we ever received 409 on any of our attempts, report the
                # "conflicting updates" blurb to the user
                if 409 in status_codes_seen:
                    raise MetaflowTaggingError(
                        "Tagging failed due to too many conflicting updates "
                        "from other processes"
                    )
                # No 409s seen... raise a more generic error
                raise MetaflowTaggingError("Tagging failed after %d tries" % tries)
            time.sleep(0.3 * random.uniform(1.4, 1.6) ** tries)
            tries += 1

    @classmethod
    def _get_object_internal(
        cls, obj_type, obj_order, sub_type, sub_order, filters, attempt, *args
    ):
        if attempt is not None:
            if cls._supports_attempt_gets is None:
                version = cls._version(None)
                cls._supports_attempt_gets = version is not None and LooseVersion(
                    version
                ) >= LooseVersion("2.0.6")
            if not cls._supports_attempt_gets:
                raise ServiceException(
                    "Getting specific attempts of Tasks or Artifacts requires "
                    "the metaflow service to be at least version 2.0.6. Please "
                    "upgrade your service"
                )

        if sub_type == "self":
            if obj_type == "artifact":
                # Special case with the artifacts; we add the attempt
                url = ServiceMetadataProvider._obj_path(
                    *args[:obj_order], attempt=attempt
                )
            else:
                url = ServiceMetadataProvider._obj_path(*args[:obj_order])
            try:
                v, _ = cls._request(None, url, "GET")
                return MetadataProvider._apply_filter([v], filters)[0]
            except ServiceException as ex:
                if ex.http_code == 404:
                    return None
                raise

        # For the other types, we locate all the objects we need to find
        # and return them
        if obj_type != "root":
            url = ServiceMetadataProvider._obj_path(*args[:obj_order])
        else:
            url = ""
        if sub_type == "metadata":
            url += "/metadata"
        elif sub_type == "artifact" and obj_type == "task" and attempt is not None:
            url += "/attempt/%s/artifacts" % attempt
        else:
            url += "/%ss" % sub_type
        try:
            v, _ = cls._request(None, url, "GET")
            return MetadataProvider._apply_filter(v, filters)
        except ServiceException as ex:
            if ex.http_code == 404:
                return None
            raise

    def _new_run(self, run_id=None, tags=None, sys_tags=None):
        # first ensure that the flow exists
        self._get_or_create("flow")
        run, did_create = self._get_or_create(
            "run", run_id, tags=tags, sys_tags=sys_tags
        )
        return str(run["run_number"]), did_create

    def _new_task(
        self, run_id, step_name, task_id=None, attempt=0, tags=None, sys_tags=None
    ):
        # first ensure that the step exists
        self._get_or_create("step", run_id, step_name)
        task, did_create = self._get_or_create(
            "task", run_id, step_name, task_id, tags=tags, sys_tags=sys_tags
        )
        if did_create:
            self._register_system_metadata(run_id, step_name, task["task_id"], attempt)
        return task["task_id"], did_create

    @staticmethod
    def _obj_path(
        flow_name,
        run_id=None,
        step_name=None,
        task_id=None,
        artifact_name=None,
        attempt=None,
    ):
        object_path = "/flows/%s" % flow_name
        if run_id is not None:
            object_path += "/runs/%s" % run_id
        if step_name is not None:
            object_path += "/steps/%s" % step_name
        if task_id is not None:
            object_path += "/tasks/%s" % task_id
        if artifact_name is not None:
            object_path += "/artifacts/%s" % artifact_name
        if attempt is not None:
            object_path += "/attempt/%s" % attempt
        return object_path

    @staticmethod
    def _create_path(obj_type, flow_name, run_id=None, step_name=None):
        create_path = "/flows/%s" % flow_name
        if obj_type == "flow":
            return create_path
        if obj_type == "run":
            return create_path + "/run"
        create_path += "/runs/%s/steps/%s" % (run_id, step_name)
        if obj_type == "step":
            return create_path + "/step"
        return create_path + "/task"

    def _get_or_create(
        self,
        obj_type,
        run_id=None,
        step_name=None,
        task_id=None,
        tags=None,
        sys_tags=None,
    ):
        if tags is None:
            tags = set()
        if sys_tags is None:
            sys_tags = set()

        def create_object():
            data = self._object_to_json(
                obj_type,
                run_id,
                step_name,
                task_id,
                self.sticky_tags.union(tags),
                self.sticky_sys_tags.union(sys_tags),
            )
            return self._request(
                self._monitor, create_path, "POST", data=data, retry_409_path=obj_path
            )

        always_create = False
        obj_path = self._obj_path(self._flow_name, run_id, step_name, task_id)
        create_path = self._create_path(obj_type, self._flow_name, run_id, step_name)
        if obj_type == "run" and run_id is None:
            always_create = True
        elif obj_type == "task" and task_id is None:
            always_create = True

        if always_create:
            return create_object()

        try:
            return self._request(self._monitor, obj_path, "GET")
        except ServiceException as ex:
            if ex.http_code == 404:
                return create_object()
            else:
                raise

    # TODO: _request() needs a more deliberate refactor at some point; it
    # looks quite overgrown.
    @classmethod
    def _request(
        cls,
        monitor,
        path,
        method,
        data=None,
        retry_409_path=None,
        return_raw_resp=False,
    ):
        if cls.INFO is None:
            raise MetaflowException(
                "Missing Metaflow Service URL. "
                "Specify with METAFLOW_SERVICE_URL environment variable"
            )
        supported_methods = ("GET", "PATCH", "POST")
        if method not in supported_methods:
            raise MetaflowException(
                "Only these methods are supported: %s, but got %s"
                % (supported_methods, method)
            )
        url = os.path.join(cls.INFO, path.lstrip("/"))
        for i in range(METADATA_SERVICE_NUM_RETRIES):
            try:
                if method == "GET":
                    if monitor:
                        with monitor.measure("metaflow.service_metadata.get"):
                            resp = requests.get(url, headers=METADATA_SERVICE_HEADERS)
                    else:
                        resp = requests.get(url, headers=METADATA_SERVICE_HEADERS)
                elif method == "POST":
                    if monitor:
                        with monitor.measure("metaflow.service_metadata.post"):
                            resp = requests.post(
                                url, headers=METADATA_SERVICE_HEADERS, json=data
                            )
                    else:
                        resp = requests.post(
                            url, headers=METADATA_SERVICE_HEADERS, json=data
                        )
                elif method == "PATCH":
                    if monitor:
                        with monitor.measure("metaflow.service_metadata.patch"):
                            resp = requests.patch(
                                url, headers=METADATA_SERVICE_HEADERS, json=data
                            )
                    else:
                        resp = requests.patch(
                            url, headers=METADATA_SERVICE_HEADERS, json=data
                        )
                else:
                    raise MetaflowInternalError("Unexpected HTTP method %s" % (method,))
            except MetaflowInternalError:
                raise
            except:  # noqa E722
                if monitor:
                    with monitor.count("metaflow.service_metadata.failed_request"):
                        if i == METADATA_SERVICE_NUM_RETRIES - 1:
                            raise
                else:
                    if i == METADATA_SERVICE_NUM_RETRIES - 1:
                        raise
                resp = None
            else:
                if return_raw_resp:
                    return resp, True
                if resp.status_code < 300:
                    return resp.json(), True
                elif resp.status_code == 409 and data is not None:
                    # a special case: the POST fails due to a conflict.
                    # This could occur when we missed a success response
                    # from the first POST request but the request actually
                    # went through, so a subsequent POST returns 409
                    # (conflict), or we end up with a conflict while running
                    # on AWS Step Functions. Instead of retrying the POST,
                    # we retry with a GET since the record is guaranteed
                    # to exist.
                    if retry_409_path:
                        v, _ = cls._request(monitor, retry_409_path, "GET")
                        return v, False
                    else:
                        return None, False
                elif resp.status_code != 503:
                    raise ServiceException(
                        "Metadata request (%s) failed (code %s): %s"
                        % (path, resp.status_code, resp.text),
                        resp.status_code,
                        resp.text,
                    )
            time.sleep(2 ** i)
        if resp:
            raise ServiceException(
                "Metadata request (%s) failed (code %s): %s"
                % (path, resp.status_code, resp.text),
                resp.status_code,
                resp.text,
            )
        else:
            raise ServiceException("Metadata request (%s) failed" % path)

    @classmethod
    def _version(cls, monitor):
        if cls.INFO is None:
            raise MetaflowException(
                "Missing Metaflow Service URL. "
                "Specify with METAFLOW_SERVICE_URL environment variable"
            )
        path = "ping"
        url = os.path.join(cls.INFO, path)
        for i in range(METADATA_SERVICE_NUM_RETRIES):
            try:
                if monitor:
                    with monitor.measure("metaflow.service_metadata.get"):
                        resp = requests.get(url, headers=METADATA_SERVICE_HEADERS)
                else:
                    resp = requests.get(url, headers=METADATA_SERVICE_HEADERS)
            except:
                if monitor:
                    with monitor.count("metaflow.service_metadata.failed_request"):
                        if i == METADATA_SERVICE_NUM_RETRIES - 1:
                            raise
                else:
                    if i == METADATA_SERVICE_NUM_RETRIES - 1:
                        raise
                resp = None
            else:
                if resp.status_code < 300:
                    return resp.headers.get("METADATA_SERVICE_VERSION", None)
                elif resp.status_code != 503:
                    raise ServiceException(
                        "Metadata request (%s) failed (code %s): %s"
                        % (url, resp.status_code, resp.text),
                        resp.status_code,
                        resp.text,
                    )
            time.sleep(2 ** i)
        if resp:
            raise ServiceException(
                "Metadata request (%s) failed (code %s): %s"
                % (url, resp.status_code, resp.text),
                resp.status_code,
                resp.text,
            )
        else:
            raise ServiceException("Metadata request (%s) failed" % url)
class BatchDecorator(StepDecorator):
    """
    Step decorator to specify that this step should execute on AWS Batch.

    This decorator indicates that your step should execute on AWS Batch. Note
    that you can apply this decorator automatically to all steps using the
    ```--with batch``` argument when calling run/resume. Step level decorators
    within the code are overrides and will force a step to execute on AWS
    Batch regardless of the ```--with``` specification.

    To use, annotate your step as follows:
    ```
    @batch
    @step
    def my_step(self):
        ...
    ```

    Parameters
    ----------
    cpu : int
        Number of CPUs required for this step. Defaults to 1. If @resources
        is also present, the maximum value from all decorators is used.
    gpu : int
        Number of GPUs required for this step. Defaults to 0. If @resources
        is also present, the maximum value from all decorators is used.
    memory : int
        Memory size (in MB) required for this step. Defaults to 4096. If
        @resources is also present, the maximum value from all decorators is
        used.
    image : string
        Docker image to use when launching on AWS Batch. If not specified, a
        default docker image mapping to the current version of Python is used.
    queue : string
        AWS Batch Job Queue to submit the job to. Defaults to the one
        specified by the environment variable METAFLOW_BATCH_JOB_QUEUE.
    iam_role : string
        AWS IAM role that the AWS Batch container uses to access AWS cloud
        resources (Amazon S3, Amazon DynamoDb, etc). Defaults to the one
        specified by the environment variable METAFLOW_ECS_S3_ACCESS_IAM_ROLE.
    execution_role : string
        AWS IAM role that AWS Batch can use to trigger AWS Fargate tasks.
        Defaults to the one determined by the environment variable
        METAFLOW_ECS_FARGATE_EXECUTION_ROLE.
        https://docs.aws.amazon.com/batch/latest/userguide/execution-IAM-role.html
    shared_memory : int
        The value for the size (in MiB) of the /dev/shm volume for this step.
        This parameter maps to the --shm-size option to docker run.
    max_swap : int
        The total amount of swap memory (in MiB) a container can use for this
        step. This parameter is translated to the --memory-swap option to
        docker run, where the value is the sum of the container memory plus
        the max_swap value.
    swappiness : int
        This allows you to tune memory swappiness behavior for this step. A
        swappiness value of 0 causes swapping not to happen unless absolutely
        necessary. A swappiness value of 100 causes pages to be swapped very
        aggressively. Accepted values are whole numbers between 0 and 100.
    """

    name = "batch"
    defaults = {
        "cpu": None,
        "gpu": None,
        "memory": None,
        "image": None,
        "queue": BATCH_JOB_QUEUE,
        "iam_role": ECS_S3_ACCESS_IAM_ROLE,
        "execution_role": ECS_FARGATE_EXECUTION_ROLE,
        "shared_memory": None,
        "max_swap": None,
        "swappiness": None,
        "host_volumes": None,
    }
    resource_defaults = {
        "cpu": "1",
        "gpu": "0",
        "memory": "4096",
    }
    package_url = None
    package_sha = None
    run_time_limit = None

    def __init__(self, attributes=None, statically_defined=False):
        super(BatchDecorator, self).__init__(attributes, statically_defined)

        # If no docker image is explicitly specified, impute a default image.
        if not self.attributes["image"]:
            # If metaflow-config specifies a docker image, just use that.
            if BATCH_CONTAINER_IMAGE:
                self.attributes["image"] = BATCH_CONTAINER_IMAGE
            # If metaflow-config doesn't specify a docker image, assign a
            # default docker image.
            else:
                # Metaflow-R has its own default docker image (rocker family)
                if R.use_r():
                    self.attributes["image"] = R.container_image()
                # Default to vanilla Python image corresponding to major.minor
                # version of the Python interpreter launching the flow.
                else:
                    self.attributes["image"] = "python:%s.%s" % (
                        platform.python_version_tuple()[0],
                        platform.python_version_tuple()[1],
                    )
        # Assign docker registry URL for the image.
        if not get_docker_registry(self.attributes["image"]):
            if BATCH_CONTAINER_REGISTRY:
                self.attributes["image"] = "%s/%s" % (
                    BATCH_CONTAINER_REGISTRY.rstrip("/"),
                    self.attributes["image"],
                )

    # Refer https://github.com/Netflix/metaflow/blob/master/docs/lifecycle.png
    # to understand where these functions are invoked in the lifecycle of a
    # Metaflow flow.
    def step_init(self, flow, graph, step, decos, environment, flow_datastore, logger):
        if flow_datastore.TYPE != "s3":
            raise BatchException("The *@batch* decorator requires --datastore=s3.")

        # Set internal state.
        self.logger = logger
        self.environment = environment
        self.step = step
        self.flow_datastore = flow_datastore

        self.attributes.update(
            compute_resource_attributes(decos, self, self.resource_defaults)
        )

        # Set run time limit for the AWS Batch job.
        self.run_time_limit = get_run_time_limit_for_task(decos)
        if self.run_time_limit < 60:
            raise BatchException(
                "The timeout for step *{step}* should be at "
                "least 60 seconds for execution on AWS Batch.".format(step=step)
            )

    def runtime_init(self, flow, graph, package, run_id):
        # Set some more internal state.
        self.flow = flow
        self.graph = graph
        self.package = package
        self.run_id = run_id

    def runtime_task_created(
        self, task_datastore, task_id, split_index, input_paths, is_cloned, ubf_context
    ):
        if not is_cloned:
            self._save_package_once(self.flow_datastore, self.package)

    def runtime_step_cli(
        self, cli_args, retry_count, max_user_code_retries, ubf_context
    ):
        if retry_count <= max_user_code_retries:
            # After all attempts to run the user code have failed, we don't
            # need to execute on AWS Batch anymore. We can execute possible
            # fallback code locally.
            cli_args.commands = ["batch", "step"]
            cli_args.command_args.append(self.package_sha)
            cli_args.command_args.append(self.package_url)
            cli_args.command_options.update(self.attributes)
            cli_args.command_options["run-time-limit"] = self.run_time_limit
            if not R.use_r():
                cli_args.entrypoint[0] = sys.executable

    def task_pre_step(
        self,
        step_name,
        task_datastore,
        metadata,
        run_id,
        task_id,
        flow,
        graph,
        retry_count,
        max_retries,
        ubf_context,
        inputs,
    ):
        self.metadata = metadata
        self.task_datastore = task_datastore

        # task_pre_step may run locally if fallback is activated for the @catch
        # decorator. In that scenario, we skip collecting AWS Batch execution
        # metadata. A rudimentary way to detect non-local execution is to
        # check for the existence of the AWS_BATCH_JOB_ID environment variable.
        if "AWS_BATCH_JOB_ID" in os.environ:
            meta = {}
            meta["aws-batch-job-id"] = os.environ["AWS_BATCH_JOB_ID"]
            meta["aws-batch-job-attempt"] = os.environ["AWS_BATCH_JOB_ATTEMPT"]
            meta["aws-batch-ce-name"] = os.environ["AWS_BATCH_CE_NAME"]
            meta["aws-batch-jq-name"] = os.environ["AWS_BATCH_JQ_NAME"]
            meta["aws-batch-execution-env"] = os.environ["AWS_EXECUTION_ENV"]

            # Capture AWS Logs metadata. This is best-effort only since
            # only V4 of the metadata URI for the ECS container hosts this
            # information, and it is quite likely that not all consumers of
            # Metaflow would be running a container agent compatible with
            # version V4.
            # https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-metadata-endpoint.html
            try:
                logs_meta = (
                    requests.get(url=os.environ["ECS_CONTAINER_METADATA_URI_V4"])
                    .json()
                    .get("LogOptions", {})
                )
                meta["aws-batch-awslogs-group"] = logs_meta.get("awslogs-group")
                meta["aws-batch-awslogs-region"] = logs_meta.get("awslogs-region")
                meta["aws-batch-awslogs-stream"] = logs_meta.get("awslogs-stream")
            except:
                pass

            entries = [
                MetaDatum(
                    field=k,
                    value=v,
                    type=k,
                    tags=["attempt_id:{0}".format(retry_count)],
                )
                for k, v in meta.items()
            ]
            # Register book-keeping metadata for debugging.
            metadata.register_metadata(run_id, step_name, task_id, entries)

            self._save_logs_sidecar = Sidecar("save_logs_periodically")
            self._save_logs_sidecar.start()

        num_parallel = int(os.environ.get("AWS_BATCH_JOB_NUM_NODES", 0))
        if num_parallel >= 1 and ubf_context == UBF_CONTROL:
            # UBF handling for the multinode case
            control_task_id = current.task_id
            top_task_id = control_task_id.replace("control-", "")  # chop "-0"
            mapper_task_ids = [control_task_id] + [
                "%s-node-%d" % (top_task_id, node_idx)
                for node_idx in range(1, num_parallel)
            ]
            flow._control_mapper_tasks = [
                "%s/%s/%s" % (run_id, step_name, mapper_task_id)
                for mapper_task_id in mapper_task_ids
            ]
            flow._control_task_is_mapper_zero = True

        if num_parallel >= 1:
            _setup_multinode_environment()

    def task_finished(
        self, step_name, flow, graph, is_task_ok, retry_count, max_retries
    ):
        # task_finished may run locally if fallback is activated for the @catch
        # decorator.
        if "AWS_BATCH_JOB_ID" in os.environ:
            # If `local` metadata is configured, we need to copy task
            # execution metadata from the AWS Batch container to the user's
            # local file system after the user code has finished execution.
            # This happens via the datastore as a communication bridge.
            if self.metadata.TYPE == "local":
                # Note that the datastore is *always* Amazon S3 (see
                # runtime_task_created function).
                sync_local_metadata_to_datastore(
                    DATASTORE_LOCAL_DIR, self.task_datastore
                )

            try:
                self._save_logs_sidecar.terminate()
            except:
                # Best-effort kill
                pass

        if is_task_ok and len(getattr(flow, "_control_mapper_tasks", [])) > 1:
            self._wait_for_mapper_tasks(flow, step_name)

    def _wait_for_mapper_tasks(self, flow, step_name):
        """
        When launching a multinode task with UBF, we need to wait for the
        secondary tasks to finish cleanly and produce their output before
        exiting the main task. Otherwise, the main task finishing would cause
        the secondary nodes to terminate immediately, and possibly prematurely.
        """
        from metaflow import Step  # avoid circular dependency

        TIMEOUT = 600
        last_completion_timeout = time.time() + TIMEOUT
        print("Waiting for batch secondary tasks to finish")
        while last_completion_timeout > time.time():
            time.sleep(2)
            try:
                step_path = "%s/%s/%s" % (flow.name, current.run_id, step_name)
                tasks = [task for task in Step(step_path)]
                if len(tasks) == len(flow._control_mapper_tasks):
                    if all(task.finished_at is not None for task in tasks):
                        # for some reason task.finished fails
                        return True
                    else:
                        print(
                            "Waiting for all parallel tasks to finish. "
                            "Finished: {}/{}".format(
                                len(tasks),
                                len(flow._control_mapper_tasks),
                            )
                        )
            except Exception:
                pass
        raise Exception(
            "Batch secondary workers did not finish in %s seconds" % TIMEOUT
        )

    @classmethod
    def _save_package_once(cls, flow_datastore, package):
        if cls.package_url is None:
            cls.package_url, cls.package_sha = flow_datastore.save_data(
                [package.blob], len_hint=1
            )[0]
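# A hedged usage sketch (not part of the original source): @batch combined
# with @resources. Per the resource-merging logic in step_init() above, the
# larger value of each shared attribute wins. The flow, step, and queue names
# are hypothetical.
from metaflow import FlowSpec, batch, resources, step


class HelloBatchFlow(FlowSpec):
    @resources(memory=16000)
    @batch(cpu=4, queue="my-job-queue")
    @step
    def start(self):
        print("Running on AWS Batch")
        self.next(self.end)

    @step
    def end(self):
        pass


if __name__ == "__main__":
    HelloBatchFlow()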