def _(self):
    """Generator wrapping ``f(self)`` with retry-on-failure semantics.

    Yields batches produced by ``f(self)``.  On an exception the fetch is
    restarted, and ``islice`` skips the ``num_batch_done`` batches already
    yielded so the consumer never sees a duplicate.  Non-retryable errors
    (anything other than "Connection refused") or exhausting MAX_RETRY
    raises FetchFailed so the scheduler can re-run the upstream task.
    """
    self.num_batch_done = 0
    while True:
        try:
            # Skip batches already delivered before a previous failure.
            for items in islice(f(self), self.num_batch_done, None):
                self.num_batch_done += 1
                yield items
            if self.num_retry > 0:
                logger.info(
                    "Fetch retry %d success for url %s, num_batch %d ",
                    self.num_retry, self.url, self.num_batch_done)
            break
        except Exception as e:
            self.num_retry += 1
            msg = "Fetch failed for url %s, tried %d/%d times. Exception: %s. " % (
                self.url, self.num_retry, MAX_RETRY, e)
            fail_fast = False
            emsg = str(e)
            # Only "Connection refused" is considered transient; everything
            # else fails fast.  (Candidates once considered retryable:
            # ["many open file", "404"].)
            if "Connection refused" not in emsg:
                fail_fast = True
                msg += "no need to retry."
            if fail_fast or self.num_retry >= MAX_RETRY:
                logger.warning(msg)
                from dpark.schedule import FetchFailed
                raise FetchFailed(self.uri, self.sid, self.mid, self.rid)
            else:
                # NOTE(review): assumes len(RETRY_INTERVALS) >= MAX_RETRY - 1,
                # otherwise this indexing could raise — confirm at definition site.
                sleep_time = RETRY_INTERVALS[self.num_retry - 1]
                msg += "sleep %d secs" % (sleep_time, )
                logger.debug(msg)
                time.sleep(sleep_time)
def _(self):
    """Generator wrapping ``f(self)`` with retry and exponential backoff.

    Yields batches from ``f(self)``; on failure the fetch restarts and
    ``islice`` skips the ``num_batch_done`` batches already yielded so no
    batch is delivered twice.  IOError mentioning "many open file" fails
    fast; otherwise up to MAX_RETRY attempts are made before raising
    FetchFailed for the scheduler.
    """
    self.num_batch_done = 0
    while True:
        try:
            # Resume after the last successfully yielded batch.
            for items in islice(f(self), self.num_batch_done, None):
                self.num_batch_done += 1
                yield items
            if self.num_retry > 0:
                logger.info(
                    "Fetch retry %d success for url %s, num_batch %d ",
                    self.num_retry, self.url, self.num_batch_done)
            break
        except Exception as e:
            logger.exception("Fetch Fail")
            self.num_retry += 1
            msg = "Fetch failed for url %s, tried %d/%d times. Exception: %r. " % (
                self.url, self.num_retry, MAX_RETRY, e)
            fail_fast = False
            # Too many open files is not recoverable by retrying here.
            if isinstance(e, IOError) and str(e).find("many open file") >= 0:
                fail_fast = True
            if fail_fast or self.num_retry >= MAX_RETRY:
                msg += "GIVE UP!"
                logger.warning(msg)
                from dpark.schedule import FetchFailed
                raise FetchFailed(self.uri, self.sid, self.mid, self.rid)
            else:
                # num_retry starts at 1 here, so the backoff sequence is
                # 1.0, 2.0, 4.0, ... seconds (not 0.5, 1.0, 2.0).
                sleep_time = 2**self.num_retry * 0.5
                msg += "sleep %d secs" % (sleep_time, )
                logger.warning(msg)
                time.sleep(sleep_time)
def on_fail(self, e):
    """Record one fetch failure: back off briefly, or raise FetchFailed.

    Gives up immediately on an IOError mentioning "many open file"
    (retrying cannot help), or once ``max_retry`` attempts are exhausted;
    otherwise sleeps with exponential backoff and returns so the caller
    can retry.
    """
    self.num_retry += 1
    msg = "Fetch failed for url %s, %d/%d. exception: %s. " % (
        self.url, self.num_retry, self.max_retry, e)

    give_up = self.num_retry >= self.max_retry
    if isinstance(e, IOError) and "many open file" in str(e):
        # fd exhaustion is not recoverable by retrying here
        give_up = True

    if not give_up:
        logger.debug(msg)
        time.sleep(2 ** self.num_retry * 0.1)
        return

    msg += "GIVE UP!"
    logger.warning(msg)
    from dpark.schedule import FetchFailed
    raise FetchFailed(self.uri, self.sid, self.mid, self.rid)
def fetch_one(self, uri, shuffleId, part, reduceId):
    """Fetch and decode one map-output block, retrying up to 2 times.

    Builds either a ``file://`` URL (when the data is local) or an HTTP
    URL, downloads the payload, validates its length header, decompresses
    it, and deserializes it according to the 1-byte flag (``b'm'`` =
    marshal, ``b'p'`` = pickle).  Raises FetchFailed when all attempts
    are exhausted.

    Fix: the urlopen handle is now closed in a ``finally`` block — the
    original only closed it on the fully-successful path, leaking the
    connection whenever read/unpack/decompress raised during a retry.
    """
    if uri == LocalFileShuffle.getServerUri():
        # urllib can open local file
        url = 'file://' + LocalFileShuffle.getOutputFile(
            shuffleId, part, reduceId)
    else:
        url = "%s/%d/%d/%d" % (uri, shuffleId, part, reduceId)
    logger.debug("fetch %s", url)

    tries = 2
    while True:
        try:
            f = urllib.request.urlopen(url)
            try:
                # NOTE(review): file:// responses may not carry .code the
                # same way HTTP ones do — behavior kept as original.
                if f.code == 404:
                    raise IOError("not found")
                d = f.read()
            finally:
                f.close()
            # Payload layout: 1-byte flag, 4-byte total length, compressed body.
            flag = d[:1]
            length, = struct.unpack("I", d[1:5])
            if length != len(d):
                raise ValueError(
                    "length not match: expected %d, but got %d" % (length, len(d)))
            d = decompress(d[5:])
            if flag == b'm':
                return marshal.loads(d)
            elif flag == b'p':
                return six.moves.cPickle.loads(d)
            else:
                raise ValueError("invalid flag")
        except Exception as e:
            logger.debug(
                "Fetch failed for shuffle %d,"
                " reduce %d, %d, %s, %s, try again",
                shuffleId, reduceId, part, url, e)
            tries -= 1
            if not tries:
                logger.warning(
                    "Fetch failed for shuffle %d,"
                    " reduce %d, %d, %s, %s",
                    shuffleId, reduceId, part, url, e)
                from dpark.schedule import FetchFailed
                raise FetchFailed(uri, shuffleId, part, reduceId)
            # exponential backoff: 0.2s after the first failure
            time.sleep(2**(2 - tries) * 0.1)