def refine(line, prefix=None, suffix=None):
    line = to_bytes(line)
    prefix = to_bytes(prefix) if prefix else b""
    suffix = to_bytes(suffix) if suffix else b""
    parts = line.split(b"]", 1)
    if len(parts) == 2:
        header, body = parts
        return b"".join((header, b"]", prefix, body, suffix))
    else:
        return line
def decorate(source, line, version=VERSION, now=None, lineid=None):
    if now is None:
        now = datetime.utcnow()
    tstamp = to_bytes(now.strftime(ISOFORMAT))
    if not lineid:
        lineid = to_bytes(str(uuid.uuid4()))
    line = to_bytes(line)
    source = to_bytes(source)
    return b"".join(
        (b"[MFLOG|", version, b"|", tstamp, b"Z|", source, b"|", lineid, b"]", line)
    )
def parse(line):
    line = to_bytes(line)
    m = LINE_PARSER.match(line)
    if m:
        try:
            fields = list(m.groups())
            fields.append(datetime.strptime(to_unicode(fields[2]), ISOFORMAT))
            return MFLogline(*fields)
        except Exception:
            # a malformed timestamp or field set means this is not a valid
            # MFLOG line; fall through and return None
            pass
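# A minimal round-trip sketch (not part of the module): it assumes the usual
# mflog module context (VERSION, ISOFORMAT, LINE_PARSER, MFLogline) so that
# decorate(), parse(), and refine() above can be exercised together.
raw = decorate(b"task", b"hello world\n")
# raw looks like b"[MFLOG|<version>|<timestamp>Z|task|<uuid>]hello world\n"
parsed = parse(raw)
# parsed is an MFLogline (or None if the line is malformed)
prefixed = refine(raw, prefix=b"[job-1] ")
# the prefix is spliced in right after the closing "]" of the header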
def resolve_workflow_name(obj, name):
    project = current.get("project_name")
    obj._is_workflow_name_modified = False
    if project:
        if name:
            raise MetaflowException(
                "--name is not supported for @projects. Use --branch instead."
            )
        workflow_name = current.project_flow_name
        project_branch = to_bytes(".".join((project, current.branch_name)))
        token_prefix = (
            "mfprj-%s"
            % to_unicode(base64.b32encode(sha1(project_branch).digest()))[:16]
        )
        is_project = True
        # Argo Workflow names can't be longer than 253 characters, so we truncate
        # by default. Also, while project and branch allow for underscores, Argo
        # Workflows doesn't (DNS Subdomain names as defined in RFC 1123) - so we
        # will remove any underscores as well as convert the name to lower case.
        if len(workflow_name) > 253:
            name_hash = to_unicode(
                base64.b32encode(sha1(to_bytes(workflow_name)).digest())
            )[:8].lower()
            workflow_name = "%s-%s" % (workflow_name[:242], name_hash)
            obj._is_workflow_name_modified = True
        if not VALID_NAME.search(workflow_name):
            workflow_name = (
                re.compile(r"^[^A-Za-z0-9]+")
                .sub("", workflow_name)
                .replace("_", "")
                .lower()
            )
            obj._is_workflow_name_modified = True
    else:
        if name and not VALID_NAME.search(name):
            raise MetaflowException(
                "Name '%s' contains invalid characters. The "
                "name must consist of lower case alphanumeric characters, '-' or '.'"
                ", and must start and end with an alphanumeric character." % name
            )
        workflow_name = name if name else current.flow_name
        token_prefix = workflow_name
        is_project = False
        if len(workflow_name) > 253:
            msg = (
                "The full name of the workflow:\n*%s*\nis longer than 253 "
                "characters.\n\n"
                "To deploy this workflow to Argo Workflows, please "
                "assign a shorter name\nusing the option\n"
                "*argo-workflows --name <name> create*." % workflow_name
            )
            raise ArgoWorkflowsNameTooLong(msg)
        if not VALID_NAME.search(workflow_name):
            workflow_name = (
                re.compile(r"^[^A-Za-z0-9]+")
                .sub("", workflow_name)
                .replace("_", "")
                .lower()
            )
            obj._is_workflow_name_modified = True
    return workflow_name, token_prefix.lower(), is_project
def resolve_state_machine_name(obj, name):
    def attach_prefix(name):
        if SFN_STATE_MACHINE_PREFIX is not None:
            return SFN_STATE_MACHINE_PREFIX + "_" + name
        return name

    project = current.get("project_name")
    obj._is_state_machine_name_hashed = False
    if project:
        if name:
            raise MetaflowException(
                "--name is not supported for @projects. Use --branch instead."
            )
        state_machine_name = attach_prefix(current.project_flow_name)
        project_branch = to_bytes(".".join((project, current.branch_name)))
        token_prefix = (
            "mfprj-%s"
            % to_unicode(base64.b32encode(sha1(project_branch).digest()))[:16]
        )
        is_project = True
        # AWS Step Functions has a limit of 80 chars for state machine names.
        # We truncate the state machine name if the computed name is greater
        # than 60 chars and append a hashed suffix to ensure uniqueness.
        if len(state_machine_name) > 60:
            name_hash = to_unicode(
                base64.b32encode(sha1(to_bytes(state_machine_name)).digest())
            )[:16].lower()
            state_machine_name = "%s-%s" % (state_machine_name[:60], name_hash)
            obj._is_state_machine_name_hashed = True
    else:
        if name and VALID_NAME.search(name):
            raise MetaflowException("Name '%s' contains invalid characters." % name)
        state_machine_name = attach_prefix(name if name else current.flow_name)
        token_prefix = state_machine_name
        is_project = False
        if len(state_machine_name) > 80:
            msg = (
                "The full name of the workflow:\n*%s*\nis longer than 80 "
                "characters.\n\n"
                "To deploy this workflow to AWS Step Functions, please "
                "assign a shorter name\nusing the option\n"
                "*step-functions --name <name> create*." % state_machine_name
            )
            raise StepFunctionsStateMachineNameTooLong(msg)
    return state_machine_name, token_prefix.lower(), is_project
def wait(self, stdout_location, stderr_location, echo=None):
    def wait_for_launch(job):
        status = job.status
        echo(
            "Task is starting (status %s)..." % status,
            "stderr",
            batch_id=job.id,
        )
        t = time.time()
        while True:
            if status != job.status or (time.time() - t) > 30:
                status = job.status
                echo(
                    "Task is starting (status %s)..." % status,
                    "stderr",
                    batch_id=job.id,
                )
                t = time.time()
            if job.is_running or job.is_done or job.is_crashed:
                break
            select.poll().poll(200)

    prefix = b"[%s] " % util.to_bytes(self.job.id)
    stdout_tail = S3Tail(stdout_location)
    stderr_tail = S3Tail(stderr_location)

    # 1) Loop until the job has started
    wait_for_launch(self.job)

    # 2) Tail logs until the job has finished
    tail_logs(
        prefix=prefix,
        stdout_tail=stdout_tail,
        stderr_tail=stderr_tail,
        echo=echo,
        has_log_updates=lambda: self.job.is_running,
    )

    # In case of hard crashes (OOM), the final save_logs won't happen.
    # We can fetch the remaining logs from AWS CloudWatch and persist them
    # to Amazon S3.
    if self.job.is_crashed:
        msg = next(
            msg
            for msg in [
                self.job.reason,
                self.job.status_reason,
                "Task crashed.",
            ]
            if msg is not None
        )
        raise BatchException(
            "%s This could be a transient error. Use @retry to retry." % msg
        )
    else:
        if self.job.is_running:
            # Kill the job if it is still running by throwing an exception.
            raise BatchException("Task failed!")
        echo(
            "Task finished with exit code %s." % self.job.status_code,
            "stderr",
            batch_id=self.job.id,
        )
def test_put_one(s3root, objs, expected):
    with S3(s3root=s3root) as s3:
        for key, obj in objs:
            s3url = s3.put(key, obj)
            assert s3url in expected
            s3obj = s3.get(key)
            assert s3obj.key == key
            assert_results([s3obj], {s3url: expected[s3url]})
            assert s3obj.blob == to_bytes(obj)
            # put with overwrite disabled
            s3url = s3.put(key, "random_value", overwrite=False)
            assert s3url in expected
            s3obj = s3.get(key)
            assert s3obj.key == key
            assert_results([s3obj], {s3url: expected[s3url]})
            assert s3obj.blob == to_bytes(obj)
def _token_generator(token_prefix):
    for i in range(10000):
        prefix = "%s-%d-" % (token_prefix, i)
        # we need to use a consistent hash here, which is why
        # random.seed(prefix) or random.seed(hash(prefix)) won't work
        random.seed(zlib.adler32(to_bytes(prefix)))
        yield prefix + "".join(random.sample(string.ascii_lowercase, 4))
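# Hedged usage sketch: seeding with zlib.adler32 of the prefix (a stable,
# cross-process hash) makes the generator deterministic for a given
# token_prefix, which is why random.seed(hash(prefix)) would not work.
gen = _token_generator("mfprj-abc123")
first = next(gen)   # "mfprj-abc123-0-" plus 4 lowercase letters, same on every run
second = next(gen)  # "mfprj-abc123-1-" plus a different 4-letter suffix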
def set_should_persist(line):
    # this marker indicates that the logline should be persisted by
    # the receiver
    line = to_bytes(line)
    if is_structured(line) and not line.startswith(b"[!["):
        return b"[!" + line
    else:
        return line
def unset_should_persist(line):
    # prior to persisting, the should_persist marker should be removed
    # from the logline using this function
    line = to_bytes(line)
    if is_structured(line) and line.startswith(b"[!["):
        return line[2:]
    else:
        return line
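# A small sketch of the persist-marker round trip, assuming the mflog helpers
# in this section are in scope: set_should_persist() only marks structured
# (MFLOG) lines, and unset_should_persist() strips the marker again.
line = decorate(b"runtime", b"save me\n")    # structured: starts with b"[MFLOG|"
marked = set_should_persist(line)            # now starts with b"[![MFLOG|"
assert is_structured(marked)
assert unset_should_persist(marked) == line  # marker removed, original restored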
def format(name):
    # Amazon EventBridge has a limit of 64 chars for rule names.
    # We truncate the rule name if the computed name is greater
    # than 64 chars and append a hashed suffix to ensure uniqueness.
    if len(name) > 64:
        name_hash = to_unicode(
            base64.b32encode(sha1(to_bytes(name)).digest())
        )[:16].lower()
        # construct a 64-character-long rule name (47 + 1 + 16 chars)
        return '%s-%s' % (name[:47], name_hash)
    else:
        return name
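# Quick sanity check of the truncation arithmetic above (sketch only):
# 47 characters of the original name, a "-", and a 16-character base32 hash
# add up to exactly 64 characters, the EventBridge rule-name limit.
assert len(format("a" * 100)) == 64
assert format("short-rule") == "short-rule"  # names within the limit pass through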
def _print_available(tail, stream, should_persist=False):
    # print the latest batch of lines from S3Tail
    prefix = b"[%s] " % util.to_bytes(self._job.id)
    try:
        for line in tail:
            if should_persist:
                line = set_should_persist(line)
            else:
                line = refine(line, prefix=prefix)
            echo(line.strip().decode("utf-8", errors="replace"), stream)
    except Exception as ex:
        echo(
            "[ temporary error in fetching logs: %s ]" % ex,
            "stderr",
            job_id=self._job.id,
        )
def cmd(cmdline, input, output):
    for path, data in input.items():
        with open(path, 'wb') as f:
            f.write(to_bytes(data))
    if subprocess.call(cmdline, shell=True):
        raise ExternalCommandFailed(
            "Command '%s' returned a non-zero exit code." % cmdline
        )
    out = []
    for path in output:
        with open(path, 'rb') as f:
            out.append(f.read())
    if len(out) == 1:
        return out[0]
    else:
        return out
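# A minimal usage sketch for cmd(): `input` maps file paths to contents that
# are written before the command runs, and `output` lists paths read back as
# bytes afterwards. The command and file names here are illustrative only.
result = cmd(
    "tr 'a-z' 'A-Z' < in.txt > out.txt",
    input={"in.txt": "hello"},
    output=["out.txt"],
)
# result == b"HELLO"; a single output path returns its bytes directly,
# while multiple output paths return a list of byte strings.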
def pytest_put_strings_case(meta=None):
    put_prefix = os.path.join(S3ROOT, PUT_PREFIX)
    data = [
        u"unicode: \u523a\u8eab means sashimi",
        b"bytes: \x00\x01\x02",
        "just a string",
    ]
    expected = {}
    objs = []
    for text in data:
        blob = to_bytes(text)
        checksum = sha1(blob).hexdigest()
        key = str(uuid4())
        expected[os.path.join(put_prefix, key)] = {
            None: ExpectedResult(
                size=len(blob),
                checksum=checksum,
                content_type=None,
                metadata=None,
                range=None,
            )
        }
        objs.append((key, text))
        if meta is not None:
            for content_type, usermeta in meta.values():
                key = str(uuid4())
                expected[os.path.join(put_prefix, key)] = {
                    None: ExpectedResult(
                        size=len(blob),
                        checksum=checksum,
                        content_type=content_type,
                        metadata=usermeta,
                        range=None,
                    )
                }
                objs.append(
                    S3PutObject(
                        key=key,
                        value=text,
                        content_type=content_type,
                        metadata=usermeta,
                    )
                )
    return {"argvalues": [(put_prefix, objs, expected)], "ids": ["put_strings"]}
def sandbox(profile):
    overwrite_config(profile)
    # Prompt for user input.
    encoded_str = click.prompt(
        'Following instructions from https://metaflow.org/sandbox, '
        'please paste the encoded magic string',
        type=str,
    )
    # Decode the bytes to env_dict.
    try:
        import base64, zlib
        from metaflow.util import to_bytes
        env_dict = json.loads(
            to_unicode(zlib.decompress(base64.b64decode(to_bytes(encoded_str))))
        )
    except:
        # TODO: Add the URL for contact us page in the error?
        raise click.BadArgumentUsage(
            'Could not decode the sandbox configuration. Please contact us.'
        )
    # Persist to a file.
    persist_env(env_dict, profile)
def wait(self, stdout_location, stderr_location, echo=None):
    def wait_for_launch(job):
        status = job.status
        echo('Task is starting (status %s)...' % status,
             'stderr',
             batch_id=job.id)
        t = time.time()
        while True:
            if status != job.status or (time.time() - t) > 30:
                status = job.status
                echo('Task is starting (status %s)...' % status,
                     'stderr',
                     batch_id=job.id)
                t = time.time()
            if job.is_running or job.is_done or job.is_crashed:
                break
            select.poll().poll(200)

    prefix = b'[%s] ' % util.to_bytes(self.job.id)

    def _print_available(tail, stream, should_persist=False):
        # print the latest batch of lines from S3Tail
        try:
            for line in tail:
                if should_persist:
                    line = set_should_persist(line)
                else:
                    line = refine(line, prefix=prefix)
                echo(line.strip().decode('utf-8', errors='replace'), stream)
        except Exception as ex:
            echo('[ temporary error in fetching logs: %s ]' % ex,
                 'stderr',
                 batch_id=self.job.id)

    stdout_tail = S3Tail(stdout_location)
    stderr_tail = S3Tail(stderr_location)

    # 1) Loop until the job has started
    wait_for_launch(self.job)

    # 2) Loop until the job has finished
    start_time = time.time()
    is_running = True
    next_log_update = start_time
    log_update_delay = 1
    while is_running:
        if time.time() > next_log_update:
            _print_available(stdout_tail, 'stdout')
            _print_available(stderr_tail, 'stderr')
            now = time.time()
            log_update_delay = update_delay(now - start_time)
            next_log_update = now + log_update_delay
            is_running = self.job.is_running
        # This sleep should never delay log updates. On the other hand,
        # we should exit this loop when the task has finished without
        # a long delay, regardless of the log tailing schedule
        d = min(log_update_delay, 5.0)
        select.poll().poll(d * 1000)

    # 3) Fetch remaining logs
    #
    # It is possible that we exit the loop above before all logs have been
    # shown.
    #
    # TODO if we notice AWS Batch failing to upload logs to S3, we can add a
    # HEAD request here to ensure that the file exists prior to calling
    # S3Tail and note the user about truncated logs if it doesn't
    _print_available(stdout_tail, 'stdout')
    _print_available(stderr_tail, 'stderr')

    # In case of hard crashes (OOM), the final save_logs won't happen.
    # We fetch the remaining logs from AWS CloudWatch and persist them to
    # Amazon S3.
    #
    # TODO: AWS CloudWatch fetch logs
    if self.job.is_crashed:
        # Map well-known shell exit codes to a human-readable explanation;
        # fall back to a generic message for any other exit code.
        if self.job.status_code == 1:
            msg = ('Task failed with a general error, such as "divide by '
                   'zero" or another impermissible operation.')
        elif self.job.status_code == 2:
            msg = ('Misuse of shell builtins. You may have a missing keyword '
                   'or command, or a permission problem (or a different '
                   'return code on a failed binary file comparison).')
        elif self.job.status_code == 126:
            msg = ('Command invoked cannot execute. This may be a permission '
                   'problem, or the command is not an executable.')
        elif self.job.status_code == 127:
            msg = ('Command not found. This may be a problem with $PATH or a '
                   'typo.')
        elif self.job.status_code == 128:
            msg = 'Invalid argument to exit.'
        elif self.job.status_code is not None and self.job.status_code > 129:
            msg = 'Signal-related error.'
        else:
            msg = 'Task crashed.'
        raise BatchException('%s '
                             'This could be a transient error. '
                             'Use @retry to retry.' % msg)
    else:
        if self.job.is_running:
            # Kill the job if it is still running by throwing an exception.
            raise BatchException("Task failed!")
        echo('Task finished with exit code %s.' % self.job.status_code,
             'stderr',
             batch_id=self.job.id)
def is_structured(line):
    line = to_bytes(line)
    return line.startswith(b"[MFLOG|") or line.startswith(b"[![MFLOG|")
def wait(self, stdout_location, stderr_location, echo=None):
    def wait_for_launch(job):
        status = job.status
        echo(
            "Task is starting (Status %s)..." % status,
            "stderr",
            job_id=job.id,
        )
        t = time.time()
        while True:
            new_status = job.status
            if status != new_status or (time.time() - t) > 30:
                status = new_status
                echo(
                    "Task is starting (Status %s)..." % status,
                    "stderr",
                    job_id=job.id,
                )
                t = time.time()
            if job.is_running or job.is_done:
                break
            time.sleep(1)

    prefix = b"[%s] " % util.to_bytes(self._job.id)
    stdout_tail = S3Tail(stdout_location)
    stderr_tail = S3Tail(stderr_location)

    # 1) Loop until the job has started
    wait_for_launch(self._job)

    # 2) Tail logs until the job has finished
    tail_logs(
        prefix=prefix,
        stdout_tail=stdout_tail,
        stderr_tail=stderr_tail,
        echo=echo,
        has_log_updates=lambda: self._job.is_running,
    )

    # 3) Fetch remaining logs
    #
    # It is possible that we exit the loop above before all logs have been
    # shown.
    #
    # TODO (savin): If we notice Kubernetes failing to upload logs to S3,
    #               we can add a HEAD request here to ensure that the file
    #               exists prior to calling S3Tail and note the user about
    #               truncated logs if it doesn't.
    # TODO (savin): For hard crashes, we can fetch logs from the pod.
    if self._job.has_failed:
        exit_code, reason = self._job.reason
        msg = next(
            msg
            for msg in [
                reason,
                "Task crashed",
            ]
            if msg is not None
        )
        if exit_code:
            if int(exit_code) == 139:
                raise KubernetesException("Task failed with a segmentation fault.")
            else:
                msg = "%s (exit code %s)" % (msg, exit_code)
        raise KubernetesException(
            "%s. This could be a transient error. Use @retry to retry." % msg
        )

    exit_code, _ = self._job.reason
    echo(
        "Task finished with exit code %s." % exit_code,
        "stderr",
        job_id=self._job.id,
    )
def wait(self, stdout_location, stderr_location, echo=None):
    def wait_for_launch(job, child_jobs):
        status = job.status
        echo(
            "Task is starting (status %s)..." % status,
            "stderr",
            batch_id=job.id,
        )
        t = time.time()
        while True:
            if status != job.status or (time.time() - t) > 30:
                if not child_jobs:
                    child_statuses = ""
                else:
                    # count the child jobs in each observed status
                    status_keys = set(
                        [child_job.status for child_job in child_jobs]
                    )
                    status_counts = [
                        (
                            status,
                            len(
                                [
                                    child_job
                                    for child_job in child_jobs
                                    if child_job.status == status
                                ]
                            ),
                        )
                        for status in status_keys
                    ]
                    child_statuses = " (parallel node status: [{}])".format(
                        ", ".join(
                            [
                                "{}:{}".format(status, num)
                                for (status, num) in sorted(status_counts)
                            ]
                        )
                    )
                status = job.status
                echo(
                    "Task is starting (status %s)... %s" % (status, child_statuses),
                    "stderr",
                    batch_id=job.id,
                )
                t = time.time()
            if job.is_running or job.is_done or job.is_crashed:
                break
            select.poll().poll(200)

    prefix = b"[%s] " % util.to_bytes(self.job.id)
    stdout_tail = S3Tail(stdout_location)
    stderr_tail = S3Tail(stderr_location)

    child_jobs = []
    if self.num_parallel > 1:
        for node in range(1, self.num_parallel):
            child_job = copy.copy(self.job)
            child_job._id = child_job._id + "#{}".format(node)
            child_jobs.append(child_job)

    # 1) Loop until the job has started
    wait_for_launch(self.job, child_jobs)

    # 2) Tail logs until the job has finished
    tail_logs(
        prefix=prefix,
        stdout_tail=stdout_tail,
        stderr_tail=stderr_tail,
        echo=echo,
        has_log_updates=lambda: self.job.is_running,
    )

    # In case of hard crashes (OOM), the final save_logs won't happen.
    # We can fetch the remaining logs from AWS CloudWatch and persist them
    # to Amazon S3.
    if self.job.is_crashed:
        msg = next(
            msg
            for msg in [
                self.job.reason,
                self.job.status_reason,
                "Task crashed.",
            ]
            if msg is not None
        )
        raise BatchException(
            "%s This could be a transient error. Use @retry to retry." % msg
        )
    else:
        if self.job.is_running:
            # Kill the job if it is still running by throwing an exception.
            raise BatchException("Task failed!")
        echo(
            "Task finished with exit code %s." % self.job.status_code,
            "stderr",
            batch_id=self.job.id,
        )
def wait(self, stdout_location, stderr_location, echo=None):
    def update_delay(secs_since_start):
        # this sigmoid function reaches
        # - 0.1 after 11 minutes
        # - 0.5 after 15 minutes
        # - 1.0 after 23 minutes
        # in other words, the user will see very frequent updates
        # during the first 10 minutes
        sigmoid = 1.0 / (1.0 + math.exp(-0.01 * secs_since_start + 9.0))
        return 0.5 + sigmoid * 30.0

    def wait_for_launch(job):
        status = job.status
        echo(
            "Task is starting (%s)..." % status,
            "stderr",
            job_id=job.id,
        )
        t = time.time()
        start_time = time.time()
        while job.is_waiting:
            new_status = job.status
            if status != new_status or (time.time() - t) > 30:
                status = new_status
                echo(
                    "Task is starting (%s)..." % status,
                    "stderr",
                    job_id=job.id,
                )
                t = time.time()
            time.sleep(update_delay(time.time() - start_time))

    prefix = b"[%s] " % util.to_bytes(self._job.id)
    stdout_tail = S3Tail(stdout_location)
    stderr_tail = S3Tail(stderr_location)

    # 1) Loop until the job has started
    wait_for_launch(self._job)

    # 2) Tail logs until the job has finished
    tail_logs(
        prefix=prefix,
        stdout_tail=stdout_tail,
        stderr_tail=stderr_tail,
        echo=echo,
        has_log_updates=lambda: self._job.is_running,
    )

    # 3) Fetch remaining logs
    #
    # It is possible that we exit the loop above before all logs have been
    # shown.
    #
    # TODO : If we notice Kubernetes failing to upload logs to S3,
    #        we can add a HEAD request here to ensure that the file
    #        exists prior to calling S3Tail and note the user about
    #        truncated logs if it doesn't.
    # TODO : For hard crashes, we can fetch logs from the pod.
    if self._job.has_failed:
        exit_code, reason = self._job.reason
        msg = next(
            msg
            for msg in [
                reason,
                "Task crashed",
            ]
            if msg is not None
        )
        if exit_code:
            if int(exit_code) == 139:
                raise KubernetesException("Task failed with a segmentation fault.")
            if int(exit_code) == 137:
                raise KubernetesException(
                    "Task ran out of memory. "
                    "Increase the available memory by specifying "
                    "@resource(memory=...) for the step. "
                )
            else:
                msg = "%s (exit code %s)" % (msg, exit_code)
        raise KubernetesException(
            "%s. This could be a transient error. Use @retry to retry." % msg
        )

    exit_code, _ = self._job.reason
    echo(
        "Task finished with exit code %s." % exit_code,
        "stderr",
        job_id=self._job.id,
    )
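# Worked numbers for the update_delay() schedule defined inside wait() above,
# using the sigmoid values quoted in its comment: sigmoid 0.1 maps to
# 0.5 + 0.1 * 30.0 = 3.5s between polls (~11 minutes in), 0.5 maps to 15.5s
# (~15 minutes), and 1.0 maps to 30.5s (~23 minutes). Early on, status is
# polled roughly every half second; long-waiting jobs back off to ~30s.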
def wait(self, stdout_location, stderr_location, echo=None):
    def wait_for_launch(job):
        status = job.status
        echo(
            "Task is starting (status %s)..." % status,
            "stderr",
            batch_id=job.id,
        )
        t = time.time()
        while True:
            if status != job.status or (time.time() - t) > 30:
                status = job.status
                echo(
                    "Task is starting (status %s)..." % status,
                    "stderr",
                    batch_id=job.id,
                )
                t = time.time()
            if job.is_running or job.is_done or job.is_crashed:
                break
            select.poll().poll(200)

    prefix = b"[%s] " % util.to_bytes(self.job.id)

    def _print_available(tail, stream, should_persist=False):
        # print the latest batch of lines from S3Tail
        try:
            for line in tail:
                if should_persist:
                    line = set_should_persist(line)
                else:
                    line = refine(line, prefix=prefix)
                echo(line.strip().decode("utf-8", errors="replace"), stream)
        except Exception as ex:
            echo(
                "[ temporary error in fetching logs: %s ]" % ex,
                "stderr",
                batch_id=self.job.id,
            )

    stdout_tail = S3Tail(stdout_location)
    stderr_tail = S3Tail(stderr_location)

    # 1) Loop until the job has started
    wait_for_launch(self.job)

    # 2) Loop until the job has finished
    start_time = time.time()
    is_running = True
    next_log_update = start_time
    log_update_delay = 1
    while is_running:
        if time.time() > next_log_update:
            _print_available(stdout_tail, "stdout")
            _print_available(stderr_tail, "stderr")
            now = time.time()
            log_update_delay = update_delay(now - start_time)
            next_log_update = now + log_update_delay
            is_running = self.job.is_running
        # This sleep should never delay log updates. On the other hand,
        # we should exit this loop when the task has finished without
        # a long delay, regardless of the log tailing schedule
        d = min(log_update_delay, 5.0)
        select.poll().poll(d * 1000)

    # 3) Fetch remaining logs
    #
    # It is possible that we exit the loop above before all logs have been
    # shown.
    #
    # TODO if we notice AWS Batch failing to upload logs to S3, we can add a
    # HEAD request here to ensure that the file exists prior to calling
    # S3Tail and note the user about truncated logs if it doesn't
    _print_available(stdout_tail, "stdout")
    _print_available(stderr_tail, "stderr")

    # In case of hard crashes (OOM), the final save_logs won't happen.
    # We fetch the remaining logs from AWS CloudWatch and persist them to
    # Amazon S3.
    if self.job.is_crashed:
        msg = next(
            msg
            for msg in [
                self.job.reason,
                self.job.status_reason,
                "Task crashed.",
            ]
            if msg is not None
        )
        raise BatchException(
            "%s This could be a transient error. Use @retry to retry." % msg
        )
    else:
        if self.job.is_running:
            # Kill the job if it is still running by throwing an exception.
            raise BatchException("Task failed!")
        echo(
            "Task finished with exit code %s." % self.job.status_code,
            "stderr",
            batch_id=self.job.id,
        )