def task_pre_step(self, step_name, ds, meta, run_id, task_id, flow, graph,
                  retry_count, max_retries):
    """Record the resolved conda environment id as task metadata."""
    env_datum = MetaDatum(field='conda_env_id',
                          value=self._env_id(),
                          type='conda_env_id')
    meta.register_metadata(run_id, step_name, task_id, [env_datum])
def task_pre_step(
    self,
    step_name,
    task_datastore,
    metadata,
    run_id,
    task_id,
    flow,
    graph,
    retry_count,
    max_user_code_retries,
    ubf_context,
    inputs,
):
    """Capture the Argo Workflows execution context as task metadata."""
    self.task_id = task_id
    attempt_tag = "attempt_id:{0}".format(retry_count)
    # Book-keeping values injected by Argo into the task's environment.
    workflow_meta = {
        "argo-workflow-template": os.environ["ARGO_WORKFLOW_TEMPLATE"],
        "argo-workflow-name": os.environ["ARGO_WORKFLOW_NAME"],
        "argo-workflow-namespace": os.environ["ARGO_WORKFLOW_NAMESPACE"],
    }
    entries = [
        MetaDatum(field=name, value=value, type=name, tags=[attempt_tag])
        for name, value in workflow_meta.items()
    ]
    # Register book-keeping metadata for debugging.
    metadata.register_metadata(run_id, step_name, task_id, entries)
def task_pre_step(
    self,
    step_name,
    task_datastore,
    metadata,
    run_id,
    task_id,
    flow,
    graph,
    retry_count,
    max_user_code_retries,
    ubf_context,
    inputs,
):
    """Attach the AWS Step Functions execution context to task metadata."""
    attempt_tag = "attempt_id:{0}".format(retry_count)
    # Book-keeping values injected into the task's environment.
    sfn_meta = {
        "aws-step-functions-execution": os.environ["METAFLOW_RUN_ID"],
        "aws-step-functions-state-machine": os.environ["SFN_STATE_MACHINE"],
    }
    entries = [
        MetaDatum(field=name, value=value, type=name, tags=[attempt_tag])
        for name, value in sfn_meta.items()
    ]
    # Register book-keeping metadata for debugging.
    metadata.register_metadata(run_id, step_name, task_id, entries)
def task_pre_step(self, step_name, ds, meta, run_id, task_id, flow, graph,
                  retry_count, max_retries, ubf_context, inputs):
    """Record the conda environment id when the decorator is active.

    Does nothing when the decorator is disabled for this UBF context.
    """
    if not self.is_enabled(ubf_context):
        return
    env_datum = MetaDatum(field='conda_env_id',
                          value=self._env_id(),
                          type='conda_env_id',
                          tags=[])
    meta.register_metadata(run_id, step_name, task_id, [env_datum])
def task_pre_step(
    self,
    step_name,
    task_datastore,
    metadata,
    run_id,
    task_id,
    flow,
    graph,
    retry_count,
    max_retries,
    ubf_context,
    inputs,
):
    """Record Kubernetes pod details as metadata and start the log sidecar."""
    self.metadata = metadata
    self.task_datastore = task_datastore

    # task_pre_step may run locally if fallback is activated for @catch
    # decorator. In that scenario there is no Kubernetes execution metadata
    # to collect; the presence of METAFLOW_KUBERNETES_WORKLOAD is a
    # rudimentary signal that we are actually executing inside a pod.
    if "METAFLOW_KUBERNETES_WORKLOAD" in os.environ:
        pod_meta = {
            "kubernetes-pod-name": os.environ["METAFLOW_KUBERNETES_POD_NAME"],
            "kubernetes-pod-namespace": os.environ[
                "METAFLOW_KUBERNETES_POD_NAMESPACE"
            ],
            "kubernetes-pod-id": os.environ["METAFLOW_KUBERNETES_POD_ID"],
            "kubernetes-pod-service-account-name": os.environ[
                "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME"
            ],
        }
        # NOTE: the Batch/v1 job name is deliberately not recorded here.
        # Reconstructing it from the pod name (by chopping the unique
        # hyphen-delimited suffix) is fragile: job names close to 63 chars
        # are truncated in the pod name, so the reconstruction would be
        # wrong.
        entries = [
            MetaDatum(field=name, value=value, type=name, tags=[])
            for name, value in pod_meta.items()
        ]
        # Register book-keeping metadata for debugging.
        metadata.register_metadata(run_id, step_name, task_id, entries)

        # Start MFLog sidecar to collect task logs.
        self._save_logs_sidecar = Sidecar("save_logs_periodically")
        self._save_logs_sidecar.start()
def task_pre_step(self, step_name, datastore, metadata, run_id, task_id,
                  flow, graph, retry_count, max_user_code_retries):
    """Attach the AWS Step Functions execution context to task metadata."""
    sfn_meta = {
        'aws-step-functions-execution': os.environ['METAFLOW_RUN_ID'],
        'aws-step-functions-state-machine': os.environ['SFN_STATE_MACHINE'],
    }
    entries = [MetaDatum(field=name, value=value, type=name)
               for name, value in sfn_meta.items()]
    # Register book-keeping metadata for debugging.
    metadata.register_metadata(run_id, step_name, task_id, entries)
def task_pre_step(self, step_name, ds, metadata, run_id, task_id, flow,
                  graph, retry_count, max_retries):
    """Record AWS Batch job details as task metadata.

    Also remembers the datastore root when the metadata provider is local.
    """
    # Only a local metadata provider exposes a usable datastore root.
    self.ds_root = ds.root if metadata.TYPE == 'local' else None
    batch_meta = {
        'aws-batch-job-id': os.environ['AWS_BATCH_JOB_ID'],
        'aws-batch-job-attempt': os.environ['AWS_BATCH_JOB_ATTEMPT'],
        'aws-batch-ce-name': os.environ['AWS_BATCH_CE_NAME'],
        'aws-batch-jq-name': os.environ['AWS_BATCH_JQ_NAME'],
    }
    entries = [MetaDatum(field=name, value=value, type=name)
               for name, value in batch_meta.items()]
    # Register book-keeping metadata for debugging.
    metadata.register_metadata(run_id, step_name, task_id, entries)
def task_pre_step(
    self,
    step_name,
    task_datastore,
    metadata,
    run_id,
    task_id,
    flow,
    graph,
    retry_count,
    max_retries,
    ubf_context,
    inputs,
):
    """Record Kubernetes pod details as metadata and start the log sidecar."""
    self.metadata = metadata
    self.task_datastore = task_datastore

    # task_pre_step may run locally if fallback is activated for @catch
    # decorator. In that scenario there is no Kubernetes execution metadata
    # to collect; the presence of METAFLOW_KUBERNETES_WORKLOAD is a
    # rudimentary signal that we are actually executing inside a pod.
    if "METAFLOW_KUBERNETES_WORKLOAD" in os.environ:
        # TODO: Get kubernetes job id and job name
        pod_meta = {
            "kubernetes-pod-id": os.environ["METAFLOW_KUBERNETES_POD_ID"],
            "kubernetes-pod-name": os.environ["METAFLOW_KUBERNETES_POD_NAME"],
            "kubernetes-pod-namespace": os.environ[
                "METAFLOW_KUBERNETES_POD_NAMESPACE"
            ],
        }
        # meta['kubernetes-job-attempt'] = ?
        entries = [
            MetaDatum(field=name, value=value, type=name, tags=[])
            for name, value in pod_meta.items()
        ]
        # Register book-keeping metadata for debugging.
        metadata.register_metadata(run_id, step_name, task_id, entries)

        # Start MFLog sidecar to collect task logs.
        self._save_logs_sidecar = SidecarSubProcess("save_logs_periodically")
def task_pre_step(self, step_name, ds, metadata, run_id, task_id, flow,
                  graph, retry_count, max_retries, ubf_context, inputs):
    """Collect AWS Batch execution metadata for this task.

    Registers the Batch job/queue/compute-environment identifiers (plus,
    best effort, the awslogs configuration) with the metadata provider,
    remembers the datastore root when the metadata provider is local, and
    starts the MFLog sidecar that periodically persists task logs.
    """
    # Only a local metadata provider exposes a usable datastore root.
    if metadata.TYPE == 'local':
        self.ds_root = ds.root
    else:
        self.ds_root = None
    meta = {}
    meta['aws-batch-job-id'] = os.environ['AWS_BATCH_JOB_ID']
    meta['aws-batch-job-attempt'] = os.environ['AWS_BATCH_JOB_ATTEMPT']
    meta['aws-batch-ce-name'] = os.environ['AWS_BATCH_CE_NAME']
    meta['aws-batch-jq-name'] = os.environ['AWS_BATCH_JQ_NAME']
    meta['aws-batch-execution-env'] = os.environ['AWS_EXECUTION_ENV']

    # Capture AWS Logs metadata. This is best effort only since
    # only V4 of the metadata uri for the ECS container hosts this
    # information and it is quite likely that not all consumers of
    # Metaflow would be running the container agent compatible with
    # version V4.
    # https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-metadata-endpoint.html
    try:
        logs_meta = requests.get(
            url=os.environ['ECS_CONTAINER_METADATA_URI_V4']) \
            .json() \
            .get('LogOptions', {})
        meta['aws-batch-awslogs-group'] = logs_meta.get('awslogs-group')
        meta['aws-batch-awslogs-region'] = logs_meta.get('awslogs-region')
        meta['aws-batch-awslogs-stream'] = logs_meta.get('awslogs-stream')
    except Exception:
        # Best effort only: the endpoint may be absent or return bad JSON.
        # Narrowed from a bare `except:` which would also have swallowed
        # SystemExit/KeyboardInterrupt.
        pass

    entries = [MetaDatum(field=k, value=v, type=k, tags=[])
               for k, v in meta.items()]
    # Register book-keeping metadata for debugging.
    metadata.register_metadata(run_id, step_name, task_id, entries)

    self._save_logs_sidecar = SidecarSubProcess('save_logs_periodically')
def task_pre_step(
    self,
    step_name,
    task_datastore,
    meta,
    run_id,
    task_id,
    flow,
    graph,
    retry_count,
    max_retries,
    ubf_context,
    inputs,
):
    """Expose the conda environment on PATH and register its id.

    Does nothing when the decorator is disabled for this UBF context.
    """
    if not self.is_enabled(ubf_context):
        return
    # Prepend the interpreter's directory to PATH so any non-pythonic
    # dependencies introduced by the conda environment are visible to the
    # user code.
    interpreter_dir = os.path.dirname(sys.executable)
    current_path = os.environ.get("PATH")
    if current_path is not None:
        interpreter_dir = os.pathsep.join([interpreter_dir, current_path])
    os.environ["PATH"] = interpreter_dir
    env_datum = MetaDatum(
        field="conda_env_id",
        value=self._env_id(),
        type="conda_env_id",
        tags=["attempt_id:{0}".format(retry_count)],
    )
    meta.register_metadata(run_id, step_name, task_id, [env_datum])
def task_pre_step(
    self,
    step_name,
    task_datastore,
    metadata,
    run_id,
    task_id,
    flow,
    graph,
    retry_count,
    max_retries,
    ubf_context,
    inputs,
):
    """Collect AWS Batch execution metadata and set up multinode state.

    When running remotely (detected via AWS_BATCH_JOB_ID), registers the
    Batch job identifiers -- plus, best effort, the awslogs configuration
    -- with the metadata provider and starts the MFLog sidecar. For
    multinode (unbounded-foreach) jobs it also records the mapper task
    pathspecs on the control task and configures the multinode
    environment.
    """
    self.metadata = metadata
    self.task_datastore = task_datastore

    # task_pre_step may run locally if fallback is activated for @catch
    # decorator. In that scenario, we skip collecting AWS Batch execution
    # metadata. A rudimentary way to detect non-local execution is to
    # check for the existence of AWS_BATCH_JOB_ID environment variable.
    if "AWS_BATCH_JOB_ID" in os.environ:
        meta = {}
        meta["aws-batch-job-id"] = os.environ["AWS_BATCH_JOB_ID"]
        meta["aws-batch-job-attempt"] = os.environ["AWS_BATCH_JOB_ATTEMPT"]
        meta["aws-batch-ce-name"] = os.environ["AWS_BATCH_CE_NAME"]
        meta["aws-batch-jq-name"] = os.environ["AWS_BATCH_JQ_NAME"]
        meta["aws-batch-execution-env"] = os.environ["AWS_EXECUTION_ENV"]

        # Capture AWS Logs metadata. This is best effort only since
        # only V4 of the metadata uri for the ECS container hosts this
        # information and it is quite likely that not all consumers of
        # Metaflow would be running the container agent compatible with
        # version V4.
        # https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-metadata-endpoint.html
        try:
            logs_meta = (requests.get(
                url=os.environ["ECS_CONTAINER_METADATA_URI_V4"]).json().
                get("LogOptions", {}))
            meta["aws-batch-awslogs-group"] = logs_meta.get("awslogs-group")
            meta["aws-batch-awslogs-region"] = logs_meta.get(
                "awslogs-region")
            meta["aws-batch-awslogs-stream"] = logs_meta.get(
                "awslogs-stream")
        except Exception:
            # Best effort only: the endpoint may be absent or return bad
            # JSON. Narrowed from a bare `except:` which would also have
            # swallowed SystemExit/KeyboardInterrupt.
            pass

        entries = [
            MetaDatum(
                field=k,
                value=v,
                type=k,
                tags=["attempt_id:{0}".format(retry_count)],
            )
            for k, v in meta.items()
        ]
        # Register book-keeping metadata for debugging.
        metadata.register_metadata(run_id, step_name, task_id, entries)

        self._save_logs_sidecar = SidecarSubProcess("save_logs_periodically")

    num_parallel = int(os.environ.get("AWS_BATCH_JOB_NUM_NODES", 0))
    if num_parallel >= 1 and ubf_context == UBF_CONTROL:
        # UBF handling for multinode case: the control task records the
        # full list of mapper task pathspecs on the flow object.
        control_task_id = current.task_id
        # Strip the "control-" prefix to recover the base task id
        # (the original comment claimed "-0" was chopped, which was wrong).
        top_task_id = control_task_id.replace("control-", "")
        mapper_task_ids = [control_task_id] + [
            "%s-node-%d" % (top_task_id, node_idx)
            for node_idx in range(1, num_parallel)
        ]
        flow._control_mapper_tasks = [
            "%s/%s/%s" % (run_id, step_name, mapper_task_id)
            for mapper_task_id in mapper_task_ids
        ]
        flow._control_task_is_mapper_zero = True

    if num_parallel >= 1:
        _setup_multinode_environment()