    def control_task_step_func(self, flow, graph, retry_count):
        from metaflow import current
        run_id = current.run_id
        step_name = current.step_name
        control_task_id = current.task_id
        (_, split_step_name, split_task_id) = control_task_id.split('-')[1:]
        # If we are running inside Conda, we use the base executable FIRST;
        # the conda environment will then be used when runtime_step_cli is
        # called. This is so that it can properly set up all the metaflow
        # aliases needed.
        env_to_use = getattr(self.environment, 'base_env', self.environment)
        executable = env_to_use.executable(step_name)
        script = sys.argv[0]

        # Access the `unbounded_foreach` param using `flow` (as datastore).
        assert(flow._unbounded_foreach)
        foreach_iter = flow.input
        if not isinstance(foreach_iter, InternalTestUnboundedForeachInput):
            raise MetaflowException('Expected type to be '\
                                    'InternalTestUnboundedForeachInput. Found %s'\
                                    % (type(foreach_iter)))
        foreach_num_splits = sum(1 for _ in foreach_iter)

        print('Simulating UnboundedForeach over value:',
              foreach_iter, 'num_splits:', foreach_num_splits)
        mapper_tasks = []

        for i in range(foreach_num_splits):
            task_id = \
                '%s-%d' % (control_task_id.replace('control-', 'test-ubf-'), i)
            pathspec = '%s/%s/%s' % (run_id, step_name, task_id)
            mapper_tasks.append(to_unicode(pathspec))
            input_paths = '%s/%s/%s' % (run_id, split_step_name, split_task_id)

            # Override specific `step` kwargs.
            kwargs = cli_args.step_kwargs
            kwargs['split_index'] = str(i)
            kwargs['run_id'] = run_id
            kwargs['task_id'] = task_id
            kwargs['input_paths'] = input_paths
            kwargs['ubf_context'] = UBF_TASK
            kwargs['retry_count'] = 0

            cmd = cli_args.step_command(executable, script, step_name,
                                        step_kwargs=kwargs)
            step_cli = u' '.join(cmd)
            # Print the command line being executed. `print` needs the
            # temporary unicode object here to work correctly.
            print(u'[${cwd}] Starting split#{split} with cmd:{cmd}'\
                  .format(cwd=os.getcwd(),
                          split=i,
                          cmd=step_cli))
            output_bytes = subprocess.check_output(cmd)
            output = to_unicode(output_bytes)
            for line in output.splitlines():
                print('[Split#%d] %s' % (i, line))
        # Save the list of (child) mapper task pathspec(s) into a designated
        # artifact `_control_mapper_tasks`.
        flow._control_mapper_tasks = mapper_tasks
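As a rough illustration of the id plumbing above, the snippet below reproduces the same string manipulation with made-up values (Metaflow does not necessarily produce ids in exactly this shape):

# Made-up values purely to illustrate the pathspec construction above.
run_id = '1623'
step_name = 'ubf_control'
control_task_id = 'control-1623-start-2'   # split('-')[1:] -> ['1623', 'start', '2']

for i in range(2):
    task_id = '%s-%d' % (control_task_id.replace('control-', 'test-ubf-'), i)
    print('%s/%s/%s' % (run_id, step_name, task_id))
# Prints:
#   1623/ubf_control/test-ubf-1623-start-2-0
#   1623/ubf_control/test-ubf-1623-start-2-1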
Example #2
def resolve_workflow_name(obj, name):
    project = current.get("project_name")
    obj._is_workflow_name_modified = False
    if project:
        if name:
            raise MetaflowException(
                "--name is not supported for @projects. Use --branch instead.")
        workflow_name = current.project_flow_name
        project_branch = to_bytes(".".join((project, current.branch_name)))
        token_prefix = (
            "mfprj-%s" %
            to_unicode(base64.b32encode(sha1(project_branch).digest()))[:16])
        is_project = True
        # Argo Workflow names can't be longer than 253 characters, so we truncate
        # by default. Also, while project and branch allow for underscores, Argo
        # Workflows doesn't (DNS Subdomain names as defined in RFC 1123) - so we will
        # remove any underscores as well as convert the name to lower case.
        if len(workflow_name) > 253:
            name_hash = to_unicode(
                base64.b32encode(sha1(
                    to_bytes(workflow_name)).digest()))[:8].lower()
            workflow_name = "%s-%s" % (workflow_name[:242], name_hash)
            obj._is_workflow_name_modified = True
        if not VALID_NAME.search(workflow_name):
            workflow_name = (re.compile(r"^[^A-Za-z0-9]+").sub(
                "", workflow_name).replace("_", "").lower())
            obj._is_workflow_name_modified = True
    else:
        if name and not VALID_NAME.search(name):
            raise MetaflowException(
                "Name '%s' contains invalid characters. The "
                "name must consist of lower case alphanumeric characters, '-' or '.'"
                ", and must start and end with an alphanumeric character." %
                name)

        workflow_name = name if name else current.flow_name
        token_prefix = workflow_name
        is_project = False

        if len(workflow_name) > 253:
            msg = ("The full name of the workflow:\n*%s*\nis longer than 253 "
                   "characters.\n\n"
                   "To deploy this workflow to Argo Workflows, please "
                   "assign a shorter name\nusing the option\n"
                   "*argo-workflows --name <name> create*." % workflow_name)
            raise ArgoWorkflowsNameTooLong(msg)

        if not VALID_NAME.search(workflow_name):
            workflow_name = (re.compile(r"^[^A-Za-z0-9]+").sub(
                "", workflow_name).replace("_", "").lower())
            obj._is_workflow_name_modified = True

    return workflow_name, token_prefix.lower(), is_project
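To see what the sanitization step above does, here is a small standalone snippet (with a hypothetical flow name) applying the same substitution:

import re

# Hypothetical project flow name containing characters Argo Workflows rejects.
name = "_my_project.user.jane_doe.MyFlow"
sanitized = re.compile(r"^[^A-Za-z0-9]+").sub("", name).replace("_", "").lower()
print(sanitized)  # myproject.user.janedoe.myflow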
Example #3
def resolve_state_machine_name(obj, name):
    def attach_prefix(name):
        if SFN_STATE_MACHINE_PREFIX is not None:
            return SFN_STATE_MACHINE_PREFIX + "_" + name
        return name

    project = current.get("project_name")
    obj._is_state_machine_name_hashed = False
    if project:
        if name:
            raise MetaflowException(
                "--name is not supported for @projects. " "Use --branch instead."
            )
        state_machine_name = attach_prefix(current.project_flow_name)
        project_branch = to_bytes(".".join((project, current.branch_name)))
        token_prefix = (
            "mfprj-%s"
            % to_unicode(base64.b32encode(sha1(project_branch).digest()))[:16]
        )
        is_project = True
        # AWS Step Functions has a limit of 80 chars for state machine names.
        # We truncate the state machine name if the computed name is greater
        # than 60 chars and append a hashed suffix to ensure uniqueness.
        if len(state_machine_name) > 60:
            name_hash = to_unicode(
                base64.b32encode(sha1(to_bytes(state_machine_name)).digest())
            )[:16].lower()
            state_machine_name = "%s-%s" % (state_machine_name[:60], name_hash)
            obj._is_state_machine_name_hashed = True
    else:
        if name and not VALID_NAME.search(name):
            raise MetaflowException("Name '%s' contains invalid characters." % name)

        state_machine_name = attach_prefix(name if name else current.flow_name)
        token_prefix = state_machine_name
        is_project = False

        if len(state_machine_name) > 80:
            msg = (
                "The full name of the workflow:\n*%s*\nis longer than 80 "
                "characters.\n\n"
                "To deploy this workflow to AWS Step Functions, please "
                "assign a shorter name\nusing the option\n"
                "*step-functions --name <name> create*." % state_machine_name
            )
            raise StepFunctionsStateMachineNameTooLong(msg)

    return state_machine_name, token_prefix.lower(), is_project
Example #4
def print_all(tail):
    for line in tail:
        if line:
            echo(self.job.id, util.to_unicode(line))
        else:
            return tail, False
    return tail, True
Example #5
    def loglines(self, stream, as_unicode=True):
        """
        Return an iterator over (utc_timestamp, logline) tuples.

        If as_unicode=False, logline is returned as a byte object. Otherwise,
        it is returned as a (unicode) string.
        """
        from metaflow.mflog.mflog import merge_logs

        global filecache

        ds_type = self.metadata_dict.get("ds-type")
        ds_root = self.metadata_dict.get("ds-root")
        if ds_type is None or ds_root is None:
            yield None, ""
            return
        if filecache is None:
            filecache = FileCache()

        attempt = self.current_attempt
        logs = filecache.get_logs_stream(ds_type, ds_root, stream, attempt,
                                         *self.path_components)
        for line in merge_logs([blob for _, blob in logs]):
            msg = to_unicode(line.msg) if as_unicode else line.msg
            yield line.utc_tstamp, msg
Example #6
def _options(mapping):
    for k, v in mapping.items():
        if v:
            k = k.replace("_", "-")
            v = v if isinstance(v, (list, tuple, set)) else [v]
            for value in v:
                yield "--%s" % k
                if not isinstance(value, bool):
                    yield to_unicode(value)
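A quick, hypothetical invocation of _options to show the flags it yields; the mapping below is invented, and to_unicode is assumed to be in scope:

# Falsy values are skipped, list values repeat the flag, booleans become bare flags.
opts = {"max_workers": 4, "tags": ["a", "b"], "namespace": None, "quiet": True}
print(list(_options(opts)))
# ['--max-workers', '4', '--tags', 'a', '--tags', 'b', '--quiet']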
Example #7
def parse(line):
    line = to_bytes(line)
    m = LINE_PARSER.match(to_bytes(line))
    if m:
        try:
            fields = list(m.groups())
            fields.append(datetime.strptime(to_unicode(fields[2]), ISOFORMAT))
            return MFLogline(*fields)
        except:
            pass
Example #8
def format(name):
    # AWS Event Bridge has a limit of 64 chars for rule names.
    # We truncate the rule name if the computed name is greater
    # than 64 chars and append a hashed suffix to ensure uniqueness.
    if len(name) > 64:
        name_hash = to_unicode(base64.b32encode(sha1(
            to_bytes(name)).digest()))[:16].lower()
        # construct a 64 character long rule name
        return '%s-%s' % (name[:47], name_hash)
    else:
        return name
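The truncated rule name keeps the first 47 characters, a dash, and a 16-character hash suffix, i.e. exactly 64 characters. A small hypothetical check, assuming the same sha1/base64/to_bytes/to_unicode imports as the example:

# Hypothetical over-long rule name, just to verify the length arithmetic.
long_name = 'x' * 80
assert len(format(long_name)) == 47 + 1 + 16 == 64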
Example #9
def _available_logs(tail, stream, echo, should_persist=False):
    # print the latest batch of lines
    try:
        for line in tail:
            if should_persist:
                line = set_should_persist(line)
            else:
                line = refine(line, prefix=prefix)
            echo(line.strip().decode("utf-8", errors="replace"), stream)
    except Exception as ex:
        echo(
            "%s[ temporary error in fetching logs: %s ]" %
            (to_unicode(prefix), ex),
            "stderr",
        )
Example #10
    def _walk(self, root):
        root = to_unicode(root)  # handle files/folders with non-ASCII chars
        prefixlen = len("%s/" % os.path.dirname(root))
        for path, dirs, files in os.walk(root):
            for fname in files:
                # ignoring filenames which are hidden;
                # TODO: Should we ignore hidden filenames?
                if fname[0] == ".":
                    continue

                # TODO: This prevents redundant packaging of .py files for the
                # default card. We should fix this logic to allow .py files to
                # be included for custom cards.
                if any(fname.endswith(s) for s in [".html", ".js", ".css"]):
                    p = os.path.join(path, fname)
                    yield p, p[prefixlen:]
Example #11
    def _walk(self, root, filter_extensions=[], prefix_root=False):
        root = to_unicode(root)  # handle files/folders with non-ASCII chars
        prfx = "%s/" % (root if prefix_root else os.path.dirname(root))
        prefixlen = len(prfx)
        for path, dirs, files in os.walk(root):
            for fname in files:
                # ignoring filenames which are hidden;
                # TODO: Should we ignore hidden filenames?
                if fname[0] == ".":
                    continue

                if len(filter_extensions) > 0 and not any(
                    fname.endswith(s) for s in filter_extensions
                ):
                    continue
                p = os.path.join(path, fname)
                yield p, p[prefixlen:]
Example #12
    def __init__(self,
                 ds_class,
                 flow_name,
                 run_id,
                 steps=None,
                 pathspecs=None,
                 metadata=None,
                 event_logger=None,
                 monitor=None,
                 prefetch_data_artifacts=None):
        data_blobs = ds_class.get_latest_tasks(flow_name,
                                               run_id,
                                               steps=steps,
                                               pathspecs=pathspecs)
        artifact_cache = {}
        datastores = [
            ds_class(flow_name,
                     run_id=run_id,
                     step_name=step_name,
                     task_id=task_id,
                     metadata=metadata,
                     attempt=attempt,
                     event_logger=event_logger,
                     monitor=monitor,
                     data_obj=json.loads(to_unicode(data_blob)),
                     artifact_cache=artifact_cache)
            for step_name, task_id, attempt, data_blob in data_blobs
        ]
        if prefetch_data_artifacts:
            artifacts_to_prefetch = set([
                ds.artifact_path(artifact_name) for ds in datastores
                for artifact_name in prefetch_data_artifacts
                if artifact_name in ds
            ])

            # Update (and not re-assign) the artifact_cache since each datastore
            # created above has a reference to this object.
            artifact_cache.update(
                ds_class.get_artifacts(artifacts_to_prefetch))
        self.pathspec_index_cache = {}
        self.pathspec_cache = {}
        for ds in datastores:
            self.pathspec_index_cache[ds.pathspec_index] = ds
            self.pathspec_cache[ds.pathspec] = ds
Example #13
def sandbox(profile):
    overwrite_config(profile)
    # Prompt for user input.
    encoded_str = click.prompt('Following instructions from '
                               'https://metaflow.org/sandbox, '
                               'please paste the encoded magic string',
                               type=str)
    # Decode the bytes to env_dict.
    try:
        import base64, zlib
        from metaflow.util import to_bytes
        env_dict =\
            json.loads(to_unicode(zlib.decompress(base64.b64decode(to_bytes(encoded_str)))))
    except:
        # TODO: Add the URL for contact us page in the error?
        raise click.BadArgumentUsage('Could not decode the sandbox '\
                                     'configuration. Please contact us.')
    # Persist to a file.
    persist_env(env_dict, profile)
Example #14
    def loglines(self, stream, as_unicode=True):
        """
        Return an iterator over (utc_timestamp, logline) tuples.

        If as_unicode=False, logline is returned as a byte object. Otherwise,
        it is returned as a (unicode) string.
        """
        from metaflow.mflog.mflog import merge_logs
        from metaflow.mflog import LOG_SOURCES
        from metaflow.datastore import DATASTORES

        ds_type = self.metadata_dict.get('ds-type')
        ds_root = self.metadata_dict.get('ds-root')

        ds_cls = DATASTORES.get(ds_type, None)
        if ds_cls is None:
            raise MetaflowInternalError('Datastore %s was not found' % ds_type)
        ds_cls.datastore_root = ds_root

        # It is possible that a task fails before any metadata has been
        # recorded. In this case, we assume that we are executing the
        # first attempt.
        #
        # FIXME: Technically we are looking at the latest *recorded* attempt
        # here. It is possible that logs exists for a newer attempt that
        # just failed to record metadata. We could make this logic more robust
        # and guarantee that we always return the latest available log.

        ds = ds_cls(self._object['flow_id'],
                    run_id=str(self._object['run_number']),
                    step_name=self._object['step_name'],
                    task_id=str(self._object['task_id']),
                    mode='r',
                    attempt=int(self.metadata_dict.get('attempt', 0)),
                    allow_unsuccessful=True)
        logs = ds.load_logs(LOG_SOURCES, stream)
        for line in merge_logs([blob for _, blob in logs]):
            msg = to_unicode(line.msg) if as_unicode else line.msg
            yield line.utc_tstamp, msg
Example #15
def echo(msg, stream="stderr", batch_id=None):
    msg = util.to_unicode(msg)
    if batch_id:
        msg = "[%s] %s" % (batch_id, msg)
    ctx.obj.echo_always(msg, err=(stream == sys.stderr))
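All of the examples above rely on metaflow.util.to_unicode (and its counterpart to_bytes). Conceptually they are thin bytes/str conversion helpers; a minimal sketch, assuming UTF-8 and not the library's exact implementation, looks like this:

def to_unicode(x):
    # Decode bytes to str; anything else is stringified.
    if isinstance(x, bytes):
        return x.decode('utf-8')
    return str(x)

def to_bytes(x):
    # Encode str (or anything stringifiable) to bytes; bytes pass through.
    if isinstance(x, bytes):
        return x
    return str(x).encode('utf-8')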