def __call__(self, *args, **kwargs):
        """Call the wrapped function, retrying on retryable exceptions.

        The keyword arguments listed below are consumed here and are NOT
        forwarded to the wrapped function; every other positional and keyword
        argument is passed through unchanged.

        Args:
            retry_timedelta (kwarg): amount of time to keep retrying before
                giving up. Defaults to ``self._retry_timedelta``; ``None``
                means 365 days.
            num_retries (kwarg): maximum number of retries before giving up.
                Defaults to ``self._num_retries``; ``None`` means 1000000.
                Forced to 0 when the ``WANDB_TEST`` environment variable is
                set to a non-empty value.
            retry_sleep_base (kwarg): seconds to sleep after the first
                failure (default 1). The sleep doubles after every failure
                and is capped at ``self.MAX_SLEEP_SECONDS``; up to 25% random
                jitter is added on top of each sleep.
            check_retry_fn (kwarg): extra predicate called with a caught
                retryable exception; a falsy result re-raises immediately.
                Defaults to ``self._check_retry_fn``.

        Returns:
            Whatever the wrapped function returns.

        Raises:
            The exception raised by the wrapped function. Exceptions outside
            ``self._retryable_exceptions`` propagate at once; retryable ones
            propagate when ``check_retry_fn`` rejects them or when the time
            or retry budget is exhausted.
        """
        retry_timedelta = kwargs.pop('retry_timedelta', self._retry_timedelta)
        if retry_timedelta is None:
            retry_timedelta = datetime.timedelta(days=365)

        num_retries = kwargs.pop('num_retries', self._num_retries)
        if num_retries is None:
            num_retries = 1000000

        if os.environ.get('WANDB_TEST'):
            num_retries = 0

        sleep = kwargs.pop('retry_sleep_base', 1)

        # An extra function that allows more logic on the filtered exceptions.
        check_retry_fn = kwargs.pop('check_retry_fn', self._check_retry_fn)

        start_time = datetime.datetime.now()
        self._num_iter = 0

        while True:
            # Only the wrapped call sits inside ``try``: an error raised while
            # reporting success must never cause the call to be re-executed.
            try:
                result = self._call_fn(*args, **kwargs)
            except self._retryable_exceptions as e:
                # If the secondary check fails, re-raise.
                if not check_retry_fn(e):
                    raise
                if (datetime.datetime.now() - start_time >= retry_timedelta
                        or self._num_iter >= num_retries):
                    raise
                # Stay quiet for the first two failures; announce the retry
                # loop (and log the traceback) on the third.
                if self._num_iter == 2:
                    logger.exception('Retry attempt failed:')
                    wandb.termlog(
                        '{} ({}), entering retry loop. See {} for full traceback.'
                        .format(self._error_prefix, e.__class__.__name__,
                                util.get_log_file_path()))
            else:
                now = datetime.datetime.now()
                # Only report a recovery if the retry loop was announced, and
                # at most once every minute.
                if (self._num_iter > 2
                        and now - self._last_print > datetime.timedelta(minutes=1)):
                    self._last_print = now
                    wandb.termlog(
                        '{} resolved after {}, resuming normal operation.'.
                        format(self._error_prefix, now - start_time))
                return result

            # Exponential backoff with up to 25% jitter.
            time.sleep(sleep + random.random() * 0.25 * sleep)
            sleep = min(sleep * 2, self.MAX_SLEEP_SECONDS)

            self._num_iter += 1
Exemple #2
0
 def log_fname(self):
     """Return the path of the log file, as given by util.get_log_file_path().

     TODO: we started work to log to a file in the run dir, but it had
     issues. For now all logs go to the same place.
     """
     shared_log_path = util.get_log_file_path()
     return shared_log_path
Exemple #3
0
    def __call__(self, *args, **kwargs):
        """Call the wrapped function, retrying on retryable exceptions.

        The keyword arguments listed below are consumed here and are NOT
        forwarded to the wrapped function; every other positional and keyword
        argument is passed through unchanged.

        Args:
            retry_timedelta (kwarg): amount of time to keep retrying before
                giving up. Defaults to ``self._retry_timedelta``; ``None``
                means 1000000 days.
            num_retries (kwarg): maximum number of retries before giving up.
                Defaults to ``self._num_retries``; ``None`` means 1000000.
                Forced to 0 when the ``WANDB_TEST`` environment variable is
                set to a non-empty value.
            retry_sleep_base (kwarg): seconds to sleep after the first
                failure (default 1). The sleep doubles after every failure
                and is capped at ``self.MAX_SLEEP_SECONDS``; up to 25% random
                jitter is added on top of each sleep.

        Returns:
            Whatever the wrapped function returns.

        Raises:
            The exception raised by the wrapped function. Exceptions outside
            ``self._retryable_exceptions`` propagate at once; retryable ones
            propagate when the time or retry budget is exhausted.
        """
        retry_timedelta = kwargs.pop('retry_timedelta', self._retry_timedelta)
        if retry_timedelta is None:
            retry_timedelta = datetime.timedelta(days=1000000)

        num_retries = kwargs.pop('num_retries', self._num_retries)
        if num_retries is None:
            num_retries = 1000000

        if os.environ.get('WANDB_TEST'):
            num_retries = 0

        sleep = kwargs.pop('retry_sleep_base', 1)

        had_failure = False
        start_time = datetime.datetime.now()
        self._num_iter = 0

        while True:
            # Only the wrapped call sits inside ``try``: an error raised while
            # reporting success must never cause the call to be re-executed.
            try:
                result = self._call_fn(*args, **kwargs)
            except self._retryable_exceptions as e:
                if (datetime.datetime.now() - start_time >= retry_timedelta
                        or self._num_iter >= num_retries):
                    raise
                # Stay quiet for the first two failures; announce the retry
                # loop (and log the traceback) on the third.
                if self._num_iter == 2:
                    logger.exception('Retry attempt failed:')
                    wandb.termlog(
                        '%s (%s), entering retry loop. See %s for full traceback.'
                        % (self._error_prefix, e.__class__.__name__,
                           util.get_log_file_path()))
                if os.getenv('WANDB_DEBUG'):
                    traceback.print_exc()
            else:
                # Report a recovery whenever at least one attempt failed.
                if had_failure:
                    wandb.termlog(
                        '%s resolved after %s, resuming normal operation.' %
                        (self._error_prefix,
                         datetime.datetime.now() - start_time))
                return result

            had_failure = True
            # Exponential backoff with up to 25% jitter.
            time.sleep(sleep + random.random() * 0.25 * sleep)
            sleep = min(sleep * 2, self.MAX_SLEEP_SECONDS)

            self._num_iter += 1