def test_strip_microseconds():
    """strip_microseconds() should return the same delta minus its sub-second part."""
    with_fraction = timedelta(
        days=2,
        hours=1,
        minutes=3,
        seconds=20,
        milliseconds=123,
        microseconds=123,
    )
    whole_seconds_only = timedelta(days=2, hours=1, minutes=3, seconds=20)

    assert strip_microseconds(with_fraction) == whole_seconds_only
def is_emr_step_complete(self, log, cluster_id, emr_step_id):
    """Check an EMR step once and report whether it has completed.

    The step is looked up via ``self.describe_step`` and its state is logged.
    While the step is pending, or once it has ended unsuccessfully, the
    cluster is looked up via ``self.describe_cluster`` and its state is
    logged as well.

    Args:
        log: Logger that receives the status messages (``info`` is called on it).
        cluster_id: ID of the cluster the step runs on.
        emr_step_id: ID of the EMR step to inspect.

    Returns:
        bool: True when the step is completed, False while it is pending
        or running.

    Raises:
        EmrError: When the step is in any state other than pending, running
            or completed.
    """
    step_info = self.describe_step(cluster_id, emr_step_id)["Step"]
    state = EmrStepState(step_info["Status"]["State"])

    if state == EmrStepState.Pending:
        # The step has not started yet; report what the cluster is doing.
        waiting_cluster = self.describe_cluster(cluster_id)["Cluster"]
        why = _get_reason(waiting_cluster)
        suffix = ""
        if why:
            suffix = ": %s" % why
        log.info("PENDING (cluster is %s%s)" % (waiting_cluster["Status"]["State"], suffix))
        return False

    if state == EmrStepState.Running:
        elapsed = ""
        started_at = step_info["Status"]["Timeline"].get("StartDateTime")
        if started_at:
            elapsed = " for %s" % strip_microseconds(_boto3_now() - started_at)
        log.info("RUNNING%s" % elapsed)
        return False

    if state == EmrStepState.Completed:
        log.info("COMPLETED")
        return True

    # Any other state: the step has failed somehow. *reason* seems to only be
    # set when the job is cancelled (e.g. 'Job terminated').
    step_reason = _get_reason(step_info)
    step_suffix = ""
    if step_reason:
        step_suffix = " (%s)" % step_reason
    log.info("%s%s" % (state.value, step_suffix))

    # Print the cluster status; this might give more context on why the step
    # didn't succeed.
    cluster = self.describe_cluster(cluster_id)["Cluster"]
    cluster_reason = _get_reason(cluster)
    cluster_suffix = ""
    if cluster_reason:
        cluster_suffix = ": %s" % cluster_reason

    cluster_ident = cluster["Id"]
    cluster_state = cluster["Status"]["State"]
    verb = "was" if "ED" in cluster_state else "is"
    log.info("Cluster %s %s %s%s" % (cluster_ident, verb, cluster_state, cluster_suffix))

    if EmrClusterState(cluster_state) in EMR_CLUSTER_TERMINATED_STATES:
        # was it caused by IAM roles?
        self._check_for_missing_default_iam_roles(log, cluster)

    # TODO: extract logs here to surface failure reason
    # See: https://github.com/dagster-io/dagster/issues/1954

    if state == EmrStepState.Failed:
        log.info("EMR step %s failed" % emr_step_id)

    raise EmrError("EMR step failed")
def _wait_for_step_to_complete(self, context, cluster_id, step_id):
    '''Helper for _wait_for_steps_to_complete(). Block until the step is done.

    Polls the step through a client from ``self.make_emr_client()``, sleeping
    ``self.check_cluster_every`` seconds before every poll, and logs the
    step state (plus the cluster state where relevant) to ``context.log``.

    Args:
        context: Object whose ``log`` attribute receives the status messages.
        cluster_id (str): The ID of the cluster
        step_id (str): ID of the step to wait for

    Raises:
        Exception: When the step ends in any state other than completed.
    '''
    check.str_param(cluster_id, 'cluster_id')
    check.str_param(step_id, 'step_id')

    client = self.make_emr_client()

    while True:
        # don't antagonize EMR's throttling
        context.log.debug('Waiting %.1f seconds...' % self.check_cluster_every)
        time.sleep(self.check_cluster_every)

        step_info = client.describe_step(ClusterId=cluster_id, StepId=step_id)['Step']
        state = EmrStepState(step_info['Status']['State'])

        if state == EmrStepState.Pending:
            # The step has not started yet; report what the cluster is doing.
            waiting_cluster = self.describe_cluster(cluster_id)
            why = _get_reason(waiting_cluster)
            suffix = ''
            if why:
                suffix = ': %s' % why
            context.log.info(
                'PENDING (cluster is %s%s)' % (waiting_cluster['Status']['State'], suffix)
            )
            continue

        if state == EmrStepState.Running:
            elapsed = ''
            started_at = step_info['Status']['Timeline'].get('StartDateTime')
            if started_at:
                elapsed = ' for %s' % strip_microseconds(_boto3_now() - started_at)
            context.log.info('RUNNING%s' % elapsed)
            continue

        # The step has reached a state we no longer need to poll for.
        break

    if state == EmrStepState.Completed:
        context.log.info('COMPLETED')
        return

    # Any other state: the step has failed somehow. *reason* seems to only be
    # set when the job is cancelled (e.g. 'Job terminated').
    step_reason = _get_reason(step_info)
    step_suffix = ''
    if step_reason:
        step_suffix = ' (%s)' % step_reason
    context.log.info('%s%s' % (state.value, step_suffix))

    # Print the cluster status; this might give more context on why the step
    # didn't succeed.
    cluster = self.describe_cluster(cluster_id)
    cluster_reason = _get_reason(cluster)
    cluster_suffix = ''
    if cluster_reason:
        cluster_suffix = ': %s' % cluster_reason

    cluster_ident = cluster['Id']
    cluster_state = cluster['Status']['State']
    verb = 'was' if 'ED' in cluster_state else 'is'
    context.log.info(
        'Cluster %s %s %s%s' % (cluster_ident, verb, cluster_state, cluster_suffix)
    )

    if EmrClusterState(cluster_state) in EMR_CLUSTER_TERMINATED_STATES:
        # was it caused by IAM roles?
        self._check_for_missing_default_iam_roles(context, cluster)

    # TODO: extract logs here to surface failure reason
    # See: https://github.com/dagster-io/dagster/issues/1954

    if state == EmrStepState.Failed:
        context.log.info('Step %s failed' % step_id)

    raise Exception('step failed')
def is_emr_step_complete(self, log, cluster_id, emr_step_id):
    '''Inspect an EMR step a single time and say whether it has completed.

    The step's state is logged on every call. When the step is pending, or
    has ended in a state other than completed, the cluster's state is
    fetched and logged too.

    Args:
        log: Logger that receives the status messages (``info`` is called on it).
        cluster_id: ID of the cluster the step runs on.
        emr_step_id: ID of the EMR step to inspect.

    Returns:
        bool: True when the step is completed, False while it is pending
        or running.

    Raises:
        EmrError: When the step is in any state other than pending, running
            or completed.
    '''
    described_step = self.describe_step(cluster_id, emr_step_id)['Step']
    current_state = EmrStepState(described_step['Status']['State'])

    if current_state == EmrStepState.Pending:
        # Not started yet: show the cluster state alongside the step state.
        pending_cluster = self.describe_cluster(cluster_id)['Cluster']
        pending_reason = _get_reason(pending_cluster)
        pending_detail = ''
        if pending_reason:
            pending_detail = ': %s' % pending_reason
        log.info(
            'PENDING (cluster is %s%s)' % (pending_cluster['Status']['State'], pending_detail)
        )
        return False

    if current_state == EmrStepState.Running:
        running_for = ''
        start_time = described_step['Status']['Timeline'].get('StartDateTime')
        if start_time:
            running_for = ' for %s' % strip_microseconds(_boto3_now() - start_time)
        log.info('RUNNING%s' % running_for)
        return False

    if current_state == EmrStepState.Completed:
        log.info('COMPLETED')
        return True

    # Any other state: the step has failed somehow. *reason* seems to only be
    # set when the job is cancelled (e.g. 'Job terminated').
    failure_reason = _get_reason(described_step)
    failure_detail = ''
    if failure_reason:
        failure_detail = ' (%s)' % failure_reason
    log.info('%s%s' % (current_state.value, failure_detail))

    # Print the cluster status; this might give more context on why the step
    # didn't succeed.
    cluster = self.describe_cluster(cluster_id)['Cluster']
    cluster_reason = _get_reason(cluster)
    cluster_detail = ''
    if cluster_reason:
        cluster_detail = ': %s' % cluster_reason

    cluster_ident = cluster['Id']
    cluster_state = cluster['Status']['State']
    tense = 'was' if 'ED' in cluster_state else 'is'
    log.info('Cluster %s %s %s%s' % (cluster_ident, tense, cluster_state, cluster_detail))

    if EmrClusterState(cluster_state) in EMR_CLUSTER_TERMINATED_STATES:
        # was it caused by IAM roles?
        self._check_for_missing_default_iam_roles(log, cluster)

    # TODO: extract logs here to surface failure reason
    # See: https://github.com/dagster-io/dagster/issues/1954

    if current_state == EmrStepState.Failed:
        log.info('EMR step %s failed' % emr_step_id)

    raise EmrError('EMR step failed')
def _wait_for_emr_step_to_complete(self, log, cluster_id, emr_step_id):
    '''Helper for wait_for_steps_to_complete(). Block until the EMR step is done.

    Sleeps ``self.check_cluster_every`` seconds before every poll of the
    step, logging the step state (and the cluster state where relevant)
    each time.

    Args:
        log: Logger that receives the status messages.
        cluster_id (str): The ID of the cluster
        emr_step_id (str): EMR Step ID to wait for

    Raises:
        EmrError: Raised when the step ends in any state other than
            completed (e.g. it is marked by EMR as failed).
    '''
    check.str_param(cluster_id, 'cluster_id')
    check.str_param(emr_step_id, 'emr_step_id')

    while True:
        # don't antagonize EMR's throttling
        log.debug('Waiting %.1f seconds...' % self.check_cluster_every)
        time.sleep(self.check_cluster_every)

        step_info = self.describe_step(cluster_id, emr_step_id)['Step']
        state = EmrStepState(step_info['Status']['State'])

        if state == EmrStepState.Pending:
            # The step has not started yet; report what the cluster is doing.
            waiting_cluster = self.describe_cluster(cluster_id)['Cluster']
            why = _get_reason(waiting_cluster)
            suffix = ''
            if why:
                suffix = ': %s' % why
            log.info('PENDING (cluster is %s%s)' % (waiting_cluster['Status']['State'], suffix))
            continue

        if state == EmrStepState.Running:
            elapsed = ''
            started_at = step_info['Status']['Timeline'].get('StartDateTime')
            if started_at:
                elapsed = ' for %s' % strip_microseconds(_boto3_now() - started_at)
            log.info('RUNNING%s' % elapsed)
            continue

        # The step has reached a state we no longer need to poll for.
        break

    if state == EmrStepState.Completed:
        log.info('COMPLETED')
        return

    # Any other state: the step has failed somehow. *reason* seems to only be
    # set when the job is cancelled (e.g. 'Job terminated').
    step_reason = _get_reason(step_info)
    step_suffix = ''
    if step_reason:
        step_suffix = ' (%s)' % step_reason
    log.info('%s%s' % (state.value, step_suffix))

    # Print the cluster status; this might give more context on why the step
    # didn't succeed.
    cluster = self.describe_cluster(cluster_id)['Cluster']
    cluster_reason = _get_reason(cluster)
    cluster_suffix = ''
    if cluster_reason:
        cluster_suffix = ': %s' % cluster_reason

    cluster_ident = cluster['Id']
    cluster_state = cluster['Status']['State']
    verb = 'was' if 'ED' in cluster_state else 'is'
    log.info('Cluster %s %s %s%s' % (cluster_ident, verb, cluster_state, cluster_suffix))

    if EmrClusterState(cluster_state) in EMR_CLUSTER_TERMINATED_STATES:
        # was it caused by IAM roles?
        self._check_for_missing_default_iam_roles(log, cluster)

    # TODO: extract logs here to surface failure reason
    # See: https://github.com/dagster-io/dagster/issues/1954

    if state == EmrStepState.Failed:
        log.info('EMR step %s failed' % emr_step_id)

    raise EmrError('EMR step failed')