def safe_append(instream, outfile, timeout = 60):
    outstream = file(outfile, "a")
    while timeout > 0:
        try:
            fcntl.flock(outstream, fcntl.LOCK_EX | fcntl.LOCK_NB)
            try:
                while True:
                    buf = instream.read(8192)
                    if not buf:
                        instream.close()
                        outstream.close()
                        return
                    outstream.write(buf)
            except Exception, x:
                # output file is in an inconsistent state;
                # we must crash the job
                err("Updating file %s failed: %s" %\
                    (outfile, x))
        except IOError, x:
            # Python docs guide us to check both the
            # EWOULDBLOCK (11) and EACCES (13) errors
            if x.errno == 11 or x.errno == 13:
                time.sleep(1)
                timeout -= 1
            else:
                raise
def netstr_reader(fd, content_len, fname):
    if content_len == None:
        err("Content-length must be defined for netstr_reader")

    def read_netstr(idx, data, tot):
        ldata = len(data)
        i = 0
        lenstr = ""
        if ldata - idx < 11:
            data = data[idx:] + fd.read(8192)
            ldata = len(data)
            idx = 0

        i = data.find(" ", idx, idx + 11)
        if i == -1:
            err("Corrupted input (%s). Could not "\
                "parse a value length at %d bytes."\
                    % (fname, tot))
        else:
            lenstr = data[idx:i + 1]
            idx = i + 1

        if ldata < i + 1:
            data_err("Truncated input (%s). "\
                "Expected %d bytes, got %d" %\
                    (fname, content_len, tot), fname)
        try:
            llen = int(lenstr)
        except ValueError:
            err("Corrupted input (%s). Could not "\
                "parse a value length at %d bytes."\
                    % (fname, tot))

        tot += len(lenstr)

        if ldata - idx < llen + 1:
            data = data[idx:] + fd.read(llen + 8193)
            ldata = len(data)
            idx = 0

        msg = data[idx:idx + llen]

        if idx + llen + 1 > ldata:
            data_err("Truncated input (%s). "\
                "Expected a value of %d bytes "\
                "(offset %u bytes)" %\
                    (fname, llen + 1, tot), fname)

        tot += llen + 1
        idx += llen + 1
        return idx, data, tot, msg

    data = fd.read(8192)
    tot = idx = 0
    while tot < content_len:
        key = val = ""
        idx, data, tot, key = read_netstr(idx, data, tot)
        idx, data, tot, val = read_netstr(idx, data, tot)
        yield key, val
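# A minimal sketch of the writer side of this format, inferred from the
# parser above (the name netstr_writer and the exact separator bytes are
# assumptions, not taken from this file): each key and value is emitted as
# "<length> <payload>" followed by a single separator byte, with key and
# value alternating.
def netstr_writer(fd, key, value, params = None):
    skey, sval = str(key), str(value)
    fd.write("%d %s %d %s\n" % (len(skey), skey, len(sval), sval))

# Round trip against netstr_reader above (the happy path never reaches
# err/data_err):
#
#   from cStringIO import StringIO
#   buf = StringIO()
#   netstr_writer(buf, "word", "1")
#   data = buf.getvalue()
#   print list(netstr_reader(StringIO(data), len(data), "<memory>"))
#   # -> [('word', '1')]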
def op_map(job):
    global job_name
    job_input = this_inputs()
    msg("Received a new map job!")

    if len(job_input) != 1:
        err("Map can only handle one input. Got: %s" %
            " ".join(job_input))

    nr_reduces = int(job['nr_reduces'])
    required_modules = job['required_modules'].split()
    fun_map_reader.func_code = marshal.loads(job['map_reader'])
    fun_map_writer.func_code = marshal.loads(job['map_writer'])
    fun_partition.func_code = marshal.loads(job['partition'])

    for m in required_modules:
        fun_map_reader.func_globals.setdefault(m, __import__(m))
        fun_partition.func_globals.setdefault(m, __import__(m))

    if 'ext_map' in job:
        if 'ext_params' in job:
            map_params = job['ext_params']
        else:
            map_params = "0\n"
        external.prepare(job['ext_map'], map_params, EXT_MAP % job_name)
        fun_map.func_code = external.ext_map.func_code
    else:
        map_params = cPickle.loads(job['params'])
        fun_map.func_code = marshal.loads(job['map'])
        for m in required_modules:
            fun_map.func_globals.setdefault(m, __import__(m))

    if 'map_init' in job:
        fun_init.func_code = marshal.loads(job['map_init'])

    if 'combiner' in job:
        fun_combiner.func_code = marshal.loads(job['combiner'])
        for m in required_modules:
            fun_combiner.func_globals.setdefault(m, __import__(m))
        partitions = [MapOutput(i, map_params, fun_combiner)\
            for i in range(nr_reduces)]
    else:
        partitions = [MapOutput(i, map_params) for i in range(nr_reduces)]

    run_map(job_input[0], partitions, map_params)
    for p in partitions:
        p.close()

    if 'chunked' in job:
        merge_chunks(partitions)
        out = "chunk://%s/%s/map-chunk-%d" %\
            (this_host(), job_name, this_partition())
    else:
        out = partitions[0].disco_address()

    external.close_ext()
    msg("%d %s" % (this_partition(), out), "OUT")
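# op_map ships user code by swapping in marshalled bytecode: the master sends
# marshal.dumps(f.func_code) and the worker assigns the result into a
# placeholder function, as in the fun_map/fun_map_reader lines above. A
# minimal, self-contained sketch of the trick (Python 2; the names below are
# made up for illustration):
import marshal

def placeholder():
    pass

def real_map(e, params):
    return [(e, 1)]

blob = marshal.dumps(real_map.func_code)     # submitting side
placeholder.func_code = marshal.loads(blob)  # worker side, as in op_map
print placeholder("x", None)                 # -> [('x', 1)]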
def write_files(ext_data, path):
    path = os.path.abspath(path)
    ensure_path(path + "/", False)
    for fname, data in ext_data.iteritems():
        # make sure that no files are written outside the given path
        p = os.path.abspath(os.path.join(path, fname))
        if os.path.dirname(p) == path:
            ensure_file(path + "/" + fname, data = data)
        else:
            err("Unsafe filename %s" % fname)
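# Hypothetical calls illustrating the check above, assuming
# path = "/tmp/job/required_files":
#
#   write_files({"helper.py": src}, path)   # writes .../required_files/helper.py
#   write_files({"../evil.py": src}, path)  # rejected: "Unsafe filename ../evil.py"
#   write_files({"pkg/mod.py": src}, path)  # also rejected: only names that
#                                           # resolve directly under path pass
#                                           # the dirname comparison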
def op_map(job):
    msg("Received a new map job!")

    if len(Task.inputs) != 1:
        err("Map can only handle one input. Got: %s" %
            " ".join(Task.inputs))

    global fun_reader, fun_writer, fun_partition
    fun_reader = util.unpack(job['map_reader'], globals=globals())
    fun_writer = util.unpack(job['map_writer'], globals=globals())
    fun_partition = util.unpack(job['partition'], globals=globals())

    global fun_init
    if 'map_init' in job:
        fun_init = util.unpack(job['map_init'], globals=globals())

    global fun_map
    if 'ext_map' in job:
        if 'ext_params' in job:
            map_params = job['ext_params']
        else:
            map_params = "0\n"

        path = Task.path("EXT_MAP")
        external.prepare(job['ext_map'], map_params, path)
        fun_map = external.ext_map
    else:
        map_params = util.unpack(job['params'], globals=globals())
        fun_map = util.unpack(job['map'], globals=globals())

    global fun_combiner
    if 'combiner' in job:
        fun_combiner = util.unpack(job['combiner'], globals=globals())

    init_common(job)

    nr_part = max(1, Task.num_partitions)

    if 'combiner' in job:
        partitions = [MapOutput(i, map_params, fun_combiner)\
            for i in range(nr_part)]
    else:
        partitions = [MapOutput(i, map_params) for i in range(nr_part)]

    run_map(Task.inputs[0], partitions, map_params)
    external.close_ext()

    urls = {}
    for i, p in enumerate(partitions):
        p.close()
        urls["%d %s" % (i, p.url())] = True

    index, index_url = Task.map_index
    safe_update(index, urls)
    OutputURL(index_url)
def map_input_stream(stream, size, url, params):
    m = re.match("(\w+)://", url)
    if m:
        scheme = m.group(1)
        try:
            mod = __import__("disco.schemes.scheme_%s" % scheme,
                             fromlist = ["scheme_%s" % scheme])
        except Exception:
            err("Unknown scheme %s in %s" % (scheme, url))
    else:
        from disco.schemes import scheme_file as mod
        url = "file://" + url
    mod.input_stream.func_globals.setdefault("Task", Task)
    return mod.input_stream(stream, size, url, params)
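# Illustration only: a hypothetical scheme module that map_input_stream above
# would load for URLs such as "raw://hello". The module path follows from the
# __import__("disco.schemes.scheme_%s" % scheme, ...) call; the body is a
# sketch, not the project's actual scheme implementation, and it returns the
# (fd, size, url) triple that download_and_sort unpacks from connect_input.
#
# disco/schemes/scheme_raw.py
from cStringIO import StringIO

def input_stream(stream, size, url, params):
    # Treat the URL payload itself as the input data.
    payload = url[len("raw://"):]
    return StringIO(payload), len(payload), url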
def read_netstr(idx, data, tot):
    ldata = len(data)
    i = 0
    lenstr = ""
    if ldata - idx < 11:
        data = data[idx:] + fd.read(8192)
        ldata = len(data)
        idx = 0

    i = data.find(" ", idx, idx + 11)
    if i == -1:
        err("Corrupted input (%s). Could not "\
            "parse a value length at %d bytes."\
                % (fname, tot))
    else:
        lenstr = data[idx:i + 1]
        idx = i + 1

    if ldata < i + 1:
        data_err("Truncated input (%s). "\
            "Expected %d bytes, got %d" %\
                (fname, content_len, tot), fname)
    try:
        llen = int(lenstr)
    except ValueError:
        err("Corrupted input (%s). Could not "\
            "parse a value length at %d bytes."\
                % (fname, tot))

    tot += len(lenstr)

    if ldata - idx < llen + 1:
        data = data[idx:] + fd.read(llen + 8193)
        ldata = len(data)
        idx = 0

    msg = data[idx:idx + llen]

    if idx + llen + 1 > ldata:
        data_err("Truncated input (%s). "\
            "Expected a value of %d bytes "\
            "(offset %u bytes)" %\
                (fname, llen + 1, tot), fname)

    tot += llen + 1
    idx += llen + 1
    return idx, data, tot, msg
def ensure_path(path, check_exists = True):
    if check_exists and os.path.exists(path):
        err("File exists: %s" % path)
    if os.path.isfile(path):
        os.remove(path)
    dirpath, fname = os.path.split(path)
    try:
        os.makedirs(dirpath)
    except OSError, x:
        if x.errno == 17:
            # File exists is ok, it may happen
            # if two tasks are racing to create
            # the directory
            pass
        else:
            raise x
def _safe_fileop(op, mode, outfile, timeout):
    outstream = file(outfile, mode)
    while timeout > 0:
        try:
            fcntl.flock(outstream, fcntl.LOCK_EX | fcntl.LOCK_NB)
            try:
                r = op(outstream)
                outstream.close()
                return r
            except Exception, x:
                # output file is in an inconsistent state;
                # we must crash the job
                err("Updating file %s failed: %s" %\
                    (outfile, x))
        except IOError, x:
            # Python / BSD docs guide us to check for these errors
            if x.errno in (errno.EACCES, errno.EAGAIN, errno.EWOULDBLOCK):
                time.sleep(0.1)
                timeout -= 0.1
            else:
                raise
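# Given _safe_fileop, the earlier safe_append can be reduced to a thin
# wrapper around it. This is a sketch of that idea, not necessarily the
# project's actual definition.
def safe_append(instream, outfile, timeout = 60):
    def op(outstream):
        # copy the input stream into the locked output file in chunks
        while True:
            buf = instream.read(8192)
            if not buf:
                instream.close()
                return
            outstream.write(buf)
    return _safe_fileop(op, "a", outfile, timeout)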
def download_and_sort(self, params):
    dlname = Task.path("REDUCE_DL", Task.id)
    msg("Reduce will be downloaded to %s" % dlname)
    out_fd = AtomicFile(dlname, "w")
    for url in self.inputs:
        fd, sze, url = connect_input(url, params)
        for k, v in fun_reader(fd, sze, url):
            if " " in k:
                err("Spaces are not allowed in keys "\
                    "with external sort.")
            if "\0" in v:
                err("Zero bytes are not allowed in "\
                    "values with external sort. "\
                    "Consider using base64 encoding.")
            out_fd.write("%s %s\0" % (k, v))
    out_fd.close()
    msg("Reduce input downloaded ok")

    msg("Starting external sort")
    sortname = Task.path("REDUCE_SORTED", Task.id)
    ensure_path(os.path.dirname(sortname))
    cmd = ["sort", "-n", "-k", "1,1", "-z",\
           "-t", " ", "-o", sortname, dlname]

    proc = subprocess.Popen(cmd)
    ret = proc.wait()
    if ret:
        err("Sorting %s to %s failed (%d)" %\
            (dlname, sortname, ret))

    msg("External sort done: %s" % sortname)
    return self.multi_file_iterator([sortname], params, reader =\
        lambda fd, sze, url:\
            re_reader("(?s)(.*?) (.*?)\000", fd, sze, url))
def download_and_sort(self):
    dlname = REDUCE_DL % (job_name, this_partition())
    ensure_path(dlname, False)
    msg("Reduce will be downloaded to %s" % dlname)
    out_fd = file(dlname + ".partial", "w")
    for fname in self.inputs:
        sze, fd = connect_input(fname)
        for k, v in fun_reduce_reader(fd, sze, fname):
            if " " in k:
                err("Spaces are not allowed in keys "\
                    "with external sort.")
            if "\0" in v:
                err("Zero bytes are not allowed in "\
                    "values with external sort. "\
                    "Consider using base64 encoding.")
            out_fd.write("%s %s\0" % (k, v))
    out_fd.close()
    os.rename(dlname + ".partial", dlname)
    msg("Reduce input downloaded ok")

    msg("Starting external sort")
    sortname = REDUCE_SORTED % (job_name, this_partition())
    ensure_path(sortname, False)
    cmd = ["sort", "-n", "-s", "-k", "1,1", "-z",\
           "-t", " ", "-o", sortname, dlname]

    proc = subprocess.Popen(cmd)
    ret = proc.wait()
    if ret:
        err("Sorting %s to %s failed (%d)" %\
            (dlname, sortname, ret))

    msg("External sort done: %s" % sortname)
    return self.multi_file_iterator([sortname], reader =\
        lambda fd, sze, fname:\
            re_reader("(.*?) (.*?)\000", fd, sze, fname))
def op_map(job):
    job_input = this_inputs()
    msg("Received a new map job!")

    if len(job_input) != 1:
        err("Map can only handle one input. Got: %s" %
            " ".join(job_input))

    nr_reduces = int(job['nr_reduces'])
    nr_part = max(1, nr_reduces)
    fun_map_reader.func_code = marshal.loads(job['map_reader'])
    fun_map_writer.func_code = marshal.loads(job['map_writer'])
    fun_partition.func_code = marshal.loads(job['partition'])

    if 'map_init' in job:
        fun_init.func_code = marshal.loads(job['map_init'])

    if 'required_files' in job:
        write_files(marshal.loads(job['required_files']), REQ_FILES)
        sys.path.insert(0, REQ_FILES)

    req_mod = job['required_modules'].split()
    import_modules(req_mod, [fun_map_reader, fun_map_writer,
        fun_partition, fun_map, fun_combiner, fun_init])

    if 'ext_map' in job:
        if 'ext_params' in job:
            map_params = job['ext_params']
        else:
            map_params = "0\n"
        external.prepare(job['ext_map'], map_params, EXT_MAP)
        fun_map.func_code = external.ext_map.func_code
    else:
        map_params = cPickle.loads(job['params'])
        fun_map.func_code = marshal.loads(job['map'])

    if 'combiner' in job:
        fun_combiner.func_code = marshal.loads(job['combiner'])
        partitions = [MapOutput(i, map_params, fun_combiner)\
            for i in range(nr_part)]
    else:
        partitions = [MapOutput(i, map_params) for i in range(nr_part)]

    run_map(job_input[0], partitions, map_params)
    external.close_ext()
    for p in partitions:
        p.close()

    if nr_reduces:
        merge_partitions(partitions)
        n = os.path.basename(PART_OUTPUT % 0)
        msg("dir://%s/%s%s:%d" % (this_host(), JOB_HOME, n,
            len(partitions) - 1), "OUT")
    else:
        res = [os.path.basename(p.fname) for p in partitions]
        index = cStringIO.StringIO("\n".join(res) + "\n")
        safe_append(index, MAP_INDEX)
        msg("dir://%s/%smap-index.txt" %\
            (this_host(), JOB_HOME), "OUT")