def control_task_step_func(self, flow, graph, retry_count):
    from metaflow import current

    run_id = current.run_id
    step_name = current.step_name
    control_task_id = current.task_id
    (_, split_step_name, split_task_id) = control_task_id.split('-')[1:]
    # If we are running inside Conda, we use the base executable FIRST;
    # the conda environment will then be used when runtime_step_cli is
    # called. This is so that it can properly set up all the metaflow
    # aliases needed.
    env_to_use = getattr(self.environment, 'base_env', self.environment)
    executable = env_to_use.executable(step_name)
    script = sys.argv[0]

    # Access the `unbounded_foreach` param using `flow` (as datastore).
    assert flow._unbounded_foreach
    foreach_iter = flow.input
    if not isinstance(foreach_iter, InternalTestUnboundedForeachInput):
        raise MetaflowException(
            'Expected type to be InternalTestUnboundedForeachInput. '
            'Found %s' % (type(foreach_iter)))
    foreach_num_splits = sum(1 for _ in foreach_iter)

    print('Simulating UnboundedForeach over value:', foreach_iter,
          'num_splits:', foreach_num_splits)
    mapper_tasks = []

    for i in range(foreach_num_splits):
        task_id = \
            '%s-%d' % (control_task_id.replace('control-', 'test-ubf-'), i)
        pathspec = '%s/%s/%s' % (run_id, step_name, task_id)
        mapper_tasks.append(to_unicode(pathspec))
        input_paths = '%s/%s/%s' % (run_id, split_step_name, split_task_id)

        # Override specific `step` kwargs.
        kwargs = cli_args.step_kwargs
        kwargs['split_index'] = str(i)
        kwargs['run_id'] = run_id
        kwargs['task_id'] = task_id
        kwargs['input_paths'] = input_paths
        kwargs['ubf_context'] = UBF_TASK
        kwargs['retry_count'] = 0

        cmd = cli_args.step_command(
            executable, script, step_name, step_kwargs=kwargs)
        step_cli = u' '.join(cmd)
        # Print cmdline for execution. Doesn't work without the temporary
        # unicode object while using `print`.
        print(u'[${cwd}] Starting split#{split} with cmd:{cmd}'.format(
            cwd=os.getcwd(), split=i, cmd=step_cli))
        output_bytes = subprocess.check_output(cmd)
        output = to_unicode(output_bytes)
        for line in output.splitlines():
            print('[Split#%d] %s' % (i, line))

    # Save the list of (child) mapper task pathspec(s) into a designated
    # artifact `_control_mapper_tasks`.
    flow._control_mapper_tasks = mapper_tasks

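# A minimal sketch (not part of the plugin) of the task-id bookkeeping above,
# under the assumption that a control task id looks like
# "control-<foreach-index>-<split-step>-<split-task>"; all values are made up.
control_task_id = "control-0-start-123"
_, split_step_name, split_task_id = control_task_id.split("-")[1:]
assert (split_step_name, split_task_id) == ("start", "123")

run_id, step_name = "456", "ubf_step"
for i in range(2):
    task_id = "%s-%d" % (control_task_id.replace("control-", "test-ubf-"), i)
    print("%s/%s/%s" % (run_id, step_name, task_id))
# 456/ubf_step/test-ubf-0-start-123-0
# 456/ubf_step/test-ubf-0-start-123-1
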
def resolve_workflow_name(obj, name):
    project = current.get("project_name")
    obj._is_workflow_name_modified = False
    if project:
        if name:
            raise MetaflowException(
                "--name is not supported for @projects. Use --branch instead."
            )
        workflow_name = current.project_flow_name
        project_branch = to_bytes(".".join((project, current.branch_name)))
        token_prefix = (
            "mfprj-%s"
            % to_unicode(base64.b32encode(sha1(project_branch).digest()))[:16]
        )
        is_project = True
        # Argo Workflow names can't be longer than 253 characters, so we
        # truncate by default. Also, while project and branch names allow
        # underscores, Argo Workflows doesn't (DNS Subdomain names as defined
        # in RFC 1123) - so we remove any underscores and convert the name to
        # lower case.
        if len(workflow_name) > 253:
            name_hash = to_unicode(
                base64.b32encode(sha1(to_bytes(workflow_name)).digest())
            )[:8].lower()
            workflow_name = "%s-%s" % (workflow_name[:242], name_hash)
            obj._is_workflow_name_modified = True
        if not VALID_NAME.search(workflow_name):
            workflow_name = (
                re.compile(r"^[^A-Za-z0-9]+")
                .sub("", workflow_name)
                .replace("_", "")
                .lower()
            )
            obj._is_workflow_name_modified = True
    else:
        if name and not VALID_NAME.search(name):
            raise MetaflowException(
                "Name '%s' contains invalid characters. The "
                "name must consist of lower case alphanumeric characters, '-' or '.'"
                ", and must start and end with an alphanumeric character." % name
            )
        workflow_name = name if name else current.flow_name
        token_prefix = workflow_name
        is_project = False
        if len(workflow_name) > 253:
            msg = (
                "The full name of the workflow:\n*%s*\nis longer than 253 "
                "characters.\n\n"
                "To deploy this workflow to Argo Workflows, please "
                "assign a shorter name\nusing the option\n"
                "*argo-workflows --name <name> create*." % workflow_name
            )
            raise ArgoWorkflowsNameTooLong(msg)
        if not VALID_NAME.search(workflow_name):
            workflow_name = (
                re.compile(r"^[^A-Za-z0-9]+")
                .sub("", workflow_name)
                .replace("_", "")
                .lower()
            )
            obj._is_workflow_name_modified = True
    return workflow_name, token_prefix.lower(), is_project

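# Illustrative helper (an assumption, not the Metaflow API) restating the same
# Argo-safe normalization: truncate over-long names with an 8-char base32 SHA1
# suffix, strip leading non-alphanumerics, drop underscores, and lowercase.
import base64
import re
from hashlib import sha1

def argo_safe_name(workflow_name):
    if len(workflow_name) > 253:
        name_hash = (
            base64.b32encode(sha1(workflow_name.encode("utf-8")).digest())[:8]
            .decode("utf-8")
            .lower()
        )
        # 242 prefix chars + "-" + 8 hash chars = 251, under the 253 limit
        workflow_name = "%s-%s" % (workflow_name[:242], name_hash)
    return re.sub(r"^[^A-Za-z0-9]+", "", workflow_name).replace("_", "").lower()

print(argo_safe_name("_my_project.user.jane_doe.flow"))  # myproject.user.janedoe.flow
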
def resolve_state_machine_name(obj, name):
    def attach_prefix(name):
        if SFN_STATE_MACHINE_PREFIX is not None:
            return SFN_STATE_MACHINE_PREFIX + "_" + name
        return name

    project = current.get("project_name")
    obj._is_state_machine_name_hashed = False
    if project:
        if name:
            raise MetaflowException(
                "--name is not supported for @projects. Use --branch instead."
            )
        state_machine_name = attach_prefix(current.project_flow_name)
        project_branch = to_bytes(".".join((project, current.branch_name)))
        token_prefix = (
            "mfprj-%s"
            % to_unicode(base64.b32encode(sha1(project_branch).digest()))[:16]
        )
        is_project = True
        # AWS Step Functions has a limit of 80 chars for state machine names.
        # We truncate the state machine name if the computed name is greater
        # than 60 chars and append a hashed suffix to ensure uniqueness.
        if len(state_machine_name) > 60:
            name_hash = to_unicode(
                base64.b32encode(sha1(to_bytes(state_machine_name)).digest())
            )[:16].lower()
            state_machine_name = "%s-%s" % (state_machine_name[:60], name_hash)
            obj._is_state_machine_name_hashed = True
    else:
        if name and VALID_NAME.search(name):
            raise MetaflowException("Name '%s' contains invalid characters." % name)
        state_machine_name = attach_prefix(name if name else current.flow_name)
        token_prefix = state_machine_name
        is_project = False
        if len(state_machine_name) > 80:
            msg = (
                "The full name of the workflow:\n*%s*\nis longer than 80 "
                "characters.\n\n"
                "To deploy this workflow to AWS Step Functions, please "
                "assign a shorter name\nusing the option\n"
                "*step-functions --name <name> create*." % state_machine_name
            )
            raise StepFunctionsStateMachineNameTooLong(msg)
    return state_machine_name, token_prefix.lower(), is_project

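# Quick arithmetic check of the truncation scheme above (hypothetical name):
# 60 prefix chars + "-" + 16 hash chars = 77, safely under the 80-char limit.
import base64
from hashlib import sha1

name = "metaflow_" + "x" * 70
if len(name) > 60:
    name_hash = base64.b32encode(sha1(name.encode()).digest())[:16].decode().lower()
    name = "%s-%s" % (name[:60], name_hash)
print(len(name))  # 77
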
def print_all(tail):
    for line in tail:
        if line:
            echo(self.job.id, util.to_unicode(line))
        else:
            return tail, False
    return tail, True

def loglines(self, stream, as_unicode=True):
    """
    Return an iterator over (utc_timestamp, logline) tuples.

    If as_unicode=False, logline is returned as a byte object. Otherwise,
    it is returned as a (unicode) string.
    """
    from metaflow.mflog.mflog import merge_logs

    global filecache

    ds_type = self.metadata_dict.get("ds-type")
    ds_root = self.metadata_dict.get("ds-root")
    if ds_type is None or ds_root is None:
        yield None, ""
        return
    if filecache is None:
        filecache = FileCache()
    attempt = self.current_attempt
    logs = filecache.get_logs_stream(
        ds_type, ds_root, stream, attempt, *self.path_components
    )
    for line in merge_logs([blob for _, blob in logs]):
        msg = to_unicode(line.msg) if as_unicode else line.msg
        yield line.utc_tstamp, msg

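# Hypothetical client-side usage of loglines(); the pathspec is made up and
# assumes a run that actually exists in your metadata service/datastore.
from metaflow import Task

task = Task("HelloFlow/12/start/345678")
for utc_tstamp, line in task.loglines("stdout"):
    print(utc_tstamp, line)
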
def _options(mapping):
    for k, v in mapping.items():
        if v:
            k = k.replace("_", "-")
            v = v if isinstance(v, (list, tuple, set)) else [v]
            for value in v:
                yield "--%s" % k
                if not isinstance(value, bool):
                    yield to_unicode(value)

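# Expected expansion (illustrative): falsy values are dropped, underscores in
# keys become dashes, iterables repeat the flag, and booleans emit the flag
# with no value.
opts = dict(max_workers=4, tags=["a", "b"], quiet=True, namespace=None)
assert list(_options(opts)) == [
    "--max-workers", "4", "--tags", "a", "--tags", "b", "--quiet"
]
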
def parse(line):
    # `to_bytes` makes the match safe for both str and bytes input.
    m = LINE_PARSER.match(to_bytes(line))
    if m:
        try:
            fields = list(m.groups())
            fields.append(datetime.strptime(to_unicode(fields[2]), ISOFORMAT))
            return MFLogline(*fields)
        except Exception:
            pass

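# A simplified, self-contained analogue of parse(): the "[DEMO|...]" line
# format, regex, and namedtuple below are illustrative stand-ins, not the real
# MFLOG wire format. It shows the same pattern: match a structured byte line,
# parse its ISO timestamp, and return a namedtuple (or None on failure).
import re
from collections import namedtuple
from datetime import datetime

DEMO_ISOFORMAT = "%Y-%m-%dT%H:%M:%S.%f"
DEMO_PARSER = re.compile(rb"\[DEMO\|(\d+)\|(.+?)\](.*)")
DemoLine = namedtuple("DemoLine", ["version", "tstamp_raw", "msg", "utc_tstamp"])

def parse_demo(line):
    m = DEMO_PARSER.match(line)
    if m:
        try:
            fields = list(m.groups())
            fields.append(datetime.strptime(fields[1].decode("utf-8"), DEMO_ISOFORMAT))
            return DemoLine(*fields)
        except ValueError:
            return None

print(parse_demo(b"[DEMO|0|2023-05-01T12:00:00.000000]hello"))
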
def format(name):
    # AWS Event Bridge has a limit of 64 chars for rule names.
    # We truncate the rule name if the computed name is greater
    # than 64 chars and append a hashed suffix to ensure uniqueness.
    if len(name) > 64:
        name_hash = to_unicode(
            base64.b32encode(sha1(to_bytes(name)).digest())
        )[:16].lower()
        # Construct a 64-character rule name: 47 + 1 + 16 chars.
        return '%s-%s' % (name[:47], name_hash)
    else:
        return name

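# Sanity check of the arithmetic (hypothetical over-long rule name):
# 47 prefix chars + "-" + 16 hash chars lands exactly on the 64-char limit.
import base64
from hashlib import sha1

name = "x" * 100
rule_hash = base64.b32encode(sha1(name.encode()).digest())[:16].decode().lower()
assert len("%s-%s" % (name[:47], rule_hash)) == 64
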
def _available_logs(tail, stream, echo, should_persist=False):
    # print the latest batch of lines
    try:
        for line in tail:
            if should_persist:
                line = set_should_persist(line)
            else:
                line = refine(line, prefix=prefix)
            echo(line.strip().decode("utf-8", errors="replace"), stream)
    except Exception as ex:
        echo(
            "%s[ temporary error in fetching logs: %s ]"
            % (to_unicode(prefix), ex),
            "stderr",
        )

def _walk(self, root):
    root = to_unicode(root)  # handle files/folders with non-ASCII chars
    prefixlen = len("%s/" % os.path.dirname(root))
    for path, dirs, files in os.walk(root):
        for fname in files:
            # Ignore hidden filenames.
            # TODO: Should we ignore hidden filenames?
            if fname[0] == ".":
                continue
            # TODO: This prevents redundant packaging of .py files for the
            # default card. We should fix this logic to allow .py files to
            # be included for custom cards.
            if any(fname.endswith(s) for s in [".html", ".js", ".css"]):
                p = os.path.join(path, fname)
                yield p, p[prefixlen:]

def _walk(self, root, filter_extensions=[], prefix_root=False):
    root = to_unicode(root)  # handle files/folders with non-ASCII chars
    prfx = "%s/" % (root if prefix_root else os.path.dirname(root))
    prefixlen = len(prfx)
    for path, dirs, files in os.walk(root):
        for fname in files:
            # Ignore hidden filenames.
            # TODO: Should we ignore hidden filenames?
            if fname[0] == ".":
                continue
            if len(filter_extensions) > 0 and not any(
                fname.endswith(s) for s in filter_extensions
            ):
                continue
            p = os.path.join(path, fname)
            yield p, p[prefixlen:]

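# Standalone demonstration of the _walk logic above (restated as a free
# function for illustration): with prefix_root=False, the yielded relative
# paths keep the root directory's own name as their first component.
import os
import tempfile

def walk_demo(root, filter_extensions=[], prefix_root=False):
    prfx = "%s/" % (root if prefix_root else os.path.dirname(root))
    prefixlen = len(prfx)
    for path, dirs, files in os.walk(root):
        for fname in files:
            if fname[0] == ".":
                continue
            if len(filter_extensions) > 0 and not any(
                fname.endswith(s) for s in filter_extensions
            ):
                continue
            p = os.path.join(path, fname)
            yield p, p[prefixlen:]

root = tempfile.mkdtemp()
open(os.path.join(root, "card.html"), "w").close()
open(os.path.join(root, "notes.txt"), "w").close()
for abs_path, rel_path in walk_demo(root, filter_extensions=[".html"]):
    print(rel_path)  # e.g. "<root-dir-name>/card.html"; notes.txt is filtered out
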
def __init__(
    self,
    ds_class,
    flow_name,
    run_id,
    steps=None,
    pathspecs=None,
    metadata=None,
    event_logger=None,
    monitor=None,
    prefetch_data_artifacts=None,
):
    data_blobs = ds_class.get_latest_tasks(
        flow_name, run_id, steps=steps, pathspecs=pathspecs
    )
    artifact_cache = {}
    datastores = [
        ds_class(
            flow_name,
            run_id=run_id,
            step_name=step_name,
            task_id=task_id,
            metadata=metadata,
            attempt=attempt,
            event_logger=event_logger,
            monitor=monitor,
            data_obj=json.loads(to_unicode(data_blob)),
            artifact_cache=artifact_cache,
        )
        for step_name, task_id, attempt, data_blob in data_blobs
    ]
    if prefetch_data_artifacts:
        artifacts_to_prefetch = set(
            ds.artifact_path(artifact_name)
            for ds in datastores
            for artifact_name in prefetch_data_artifacts
            if artifact_name in ds
        )
        # Update (and do not re-assign) artifact_cache since each datastore
        # created above holds a reference to this object.
        artifact_cache.update(ds_class.get_artifacts(artifacts_to_prefetch))
    self.pathspec_index_cache = {}
    self.pathspec_cache = {}
    for ds in datastores:
        self.pathspec_index_cache[ds.pathspec_index] = ds
        self.pathspec_cache[ds.pathspec] = ds

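# Minimal illustration of why update() is used rather than re-assignment:
# every datastore holds a reference to the same dict, so an in-place update
# is visible through all of them, while rebinding the name would not be.
artifact_cache = {}
holders = [artifact_cache, artifact_cache]  # stand-ins for the datastores
artifact_cache.update({"artifact/path": b"blob"})
assert all("artifact/path" in h for h in holders)
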
def sandbox(profile):
    overwrite_config(profile)
    # Prompt for user input.
    encoded_str = click.prompt(
        'Following instructions from https://metaflow.org/sandbox, '
        'please paste the encoded magic string',
        type=str,
    )
    # Decode the bytes to env_dict.
    try:
        import base64
        import zlib

        from metaflow.util import to_bytes

        env_dict = json.loads(
            to_unicode(zlib.decompress(base64.b64decode(to_bytes(encoded_str))))
        )
    except:
        # TODO: Add the URL for contact us page in the error?
        raise click.BadArgumentUsage(
            'Could not decode the sandbox configuration. Please contact us.'
        )
    # Persist to a file.
    persist_env(env_dict, profile)

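# Round-trip sketch of the "magic string" pipeline assumed above: the producer
# presumably does JSON -> zlib -> base64, which the decode step reverses.
# The payload is hypothetical.
import base64
import json
import zlib

env_dict = {"METAFLOW_PROFILE": "sandbox"}
encoded_str = base64.b64encode(zlib.compress(json.dumps(env_dict).encode("utf-8")))
decoded = json.loads(zlib.decompress(base64.b64decode(encoded_str)).decode("utf-8"))
assert decoded == env_dict
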
def loglines(self, stream, as_unicode=True):
    """
    Return an iterator over (utc_timestamp, logline) tuples.

    If as_unicode=False, logline is returned as a byte object. Otherwise,
    it is returned as a (unicode) string.
    """
    from metaflow.mflog.mflog import merge_logs
    from metaflow.mflog import LOG_SOURCES
    from metaflow.datastore import DATASTORES

    ds_type = self.metadata_dict.get('ds-type')
    ds_root = self.metadata_dict.get('ds-root')
    ds_cls = DATASTORES.get(ds_type, None)
    if ds_cls is None:
        raise MetaflowInternalError('Datastore %s was not found' % ds_type)
    ds_cls.datastore_root = ds_root

    # It is possible that a task fails before any metadata has been
    # recorded. In this case, we assume that we are executing the
    # first attempt.
    #
    # FIXME: Technically we are looking at the latest *recorded* attempt
    # here. It is possible that logs exist for a newer attempt that
    # just failed to record metadata. We could make this logic more robust
    # and guarantee that we always return the latest available log.
    ds = ds_cls(
        self._object['flow_id'],
        run_id=str(self._object['run_number']),
        step_name=self._object['step_name'],
        task_id=str(self._object['task_id']),
        mode='r',
        attempt=int(self.metadata_dict.get('attempt', 0)),
        allow_unsuccessful=True,
    )
    logs = ds.load_logs(LOG_SOURCES, stream)
    for line in merge_logs([blob for _, blob in logs]):
        msg = to_unicode(line.msg) if as_unicode else line.msg
        yield line.utc_tstamp, msg

def echo(msg, stream="stderr", batch_id=None):
    msg = util.to_unicode(msg)
    if batch_id:
        msg = "[%s] %s" % (batch_id, msg)
    # `stream` is a string name, so compare against "stderr" (comparing
    # against sys.stderr would always be False for the default argument).
    ctx.obj.echo_always(msg, err=(stream == "stderr"))