def __init__(self, input_files, do_sort, mem_sort_limit):
    self.inputs = []
    part = PART_SUFFIX % this_partition()
    for input in input_files:
        if input.startswith("dir://"):
            try:
                self.inputs += parse_dir(input, part_id = this_partition())
            except:
                data_err("Couldn't resolve address %s"\
                         % input, input)
        else:
            self.inputs.append(input)
    self.line_count = 0
    if do_sort:
        total_size = 0
        for input in self.inputs:
            sze, fd = connect_input(input)
            total_size += sze
        msg("Reduce[%d] input is %.2fMB" %\
            (this_partition(), total_size / 1024.0**2))
        if total_size > mem_sort_limit:
            self.iterator = self.download_and_sort()
        else:
            msg("Sorting in memory")
            m = list(self.multi_file_iterator(self.inputs, False))
            m.sort(num_cmp)
            self.iterator = self.list_iterator(m)
    else:
        self.iterator = self.multi_file_iterator(self.inputs)

def re_reader(item_re_str, fd, content_len, fname,
              output_tail=False, read_buffer_size=8192):
    item_re = re.compile(item_re_str)
    buf = ""
    tot = 0
    while True:
        if content_len:
            r = fd.read(min(read_buffer_size, content_len - tot))
        else:
            r = fd.read(read_buffer_size)
        tot += len(r)
        buf += r

        m = item_re.match(buf)
        while m:
            yield m.groups()
            buf = buf[m.end():]
            m = item_re.match(buf)

        # Stop at EOF, or once the expected number of bytes has been read.
        # The explicit None check keeps a reader with an unknown content
        # length streaming until EOF instead of comparing tot against None.
        if not len(r) or (content_len != None and tot >= content_len):
            if content_len != None and tot < content_len:
                data_err("Truncated input (%s). "\
                         "Expected %d bytes, got %d" %\
                         (fname, content_len, tot), fname)
            if len(buf):
                if output_tail:
                    yield [buf]
                else:
                    msg("Couldn't match the last %d "\
                        "bytes in %s. Some bytes may be "\
                        "missing from input." %\
                        (len(buf), fname))
            break

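# A minimal usage sketch for re_reader (not part of the worker itself): any
# file-like object works as fd. The buffer, byte count and name below are
# illustrative only; StringIO stands in for a real input connection.
from StringIO import StringIO

example_buf = StringIO("foo\nbar\nbaz\n")
for groups in re_reader("(.*?)\n", example_buf, 12, "<in-memory example>"):
    print groups  # ('foo',), then ('bar',), then ('baz',)
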
def __init__(self, input_files, do_sort, mem_sort_limit):
    self.inputs = []
    part = PART_SUFFIX % this_partition()
    for input in input_files:
        if input.startswith("dir://"):
            self.inputs += [x for x in parse_dir(input)\
                            if x.startswith("chunk://") or\
                               x.endswith(part)]
        else:
            self.inputs.append(input)
    self.line_count = 0
    if do_sort:
        total_size = 0
        for input in self.inputs:
            sze, fd = connect_input(input)
            total_size += sze
        msg("Reduce[%d] input is %.2fMB" %\
            (this_partition(), total_size / 1024.0**2))
        if total_size > mem_sort_limit:
            self.iterator = self.download_and_sort()
        else:
            msg("Sorting in memory")
            m = list(self.multi_file_iterator(self.inputs, False))
            m.sort(num_cmp)
            self.iterator = self.list_iterator(m)
    else:
        self.iterator = self.multi_file_iterator(self.inputs)

def list_iterator(self, lst):
    i = 0
    for x in lst:
        yield x
        i += 1
        if status_interval and not i % status_interval:
            msg("%d entries reduced" % i)
    msg("Reduce done: %d entries reduced in total" % i)

def op_map(job):
    global job_name
    job_input = this_inputs()
    msg("Received a new map job!")
    if len(job_input) != 1:
        err("Map can only handle one input. Got: %s" % " ".join(job_input))

    nr_reduces = int(job['nr_reduces'])
    required_modules = job['required_modules'].split()

    fun_map_reader.func_code = marshal.loads(job['map_reader'])
    fun_map_writer.func_code = marshal.loads(job['map_writer'])
    fun_partition.func_code = marshal.loads(job['partition'])
    for m in required_modules:
        fun_map_reader.func_globals.setdefault(m, __import__(m))
        fun_partition.func_globals.setdefault(m, __import__(m))

    if 'ext_map' in job:
        if 'ext_params' in job:
            map_params = job['ext_params']
        else:
            map_params = "0\n"
        external.prepare(job['ext_map'], map_params, EXT_MAP % job_name)
        fun_map.func_code = external.ext_map.func_code
    else:
        map_params = cPickle.loads(job['params'])
        fun_map.func_code = marshal.loads(job['map'])
        for m in required_modules:
            fun_map.func_globals.setdefault(m, __import__(m))

    if 'map_init' in job:
        fun_init.func_code = marshal.loads(job['map_init'])

    if 'combiner' in job:
        fun_combiner.func_code = marshal.loads(job['combiner'])
        for m in required_modules:
            fun_combiner.func_globals.setdefault(m, __import__(m))
        partitions = [MapOutput(i, map_params, fun_combiner)\
                      for i in range(nr_reduces)]
    else:
        partitions = [MapOutput(i, map_params) for i in range(nr_reduces)]

    run_map(job_input[0], partitions, map_params)
    for p in partitions:
        p.close()

    if 'chunked' in job:
        merge_chunks(partitions)
        out = "chunk://%s/%s/map-chunk-%d" %\
              (this_host(), job_name, this_partition())
    else:
        out = partitions[0].disco_address()

    external.close_ext()
    msg("%d %s" % (this_partition(), out), "OUT")

def op_map(job):
    msg("Received a new map job!")
    if len(Task.inputs) != 1:
        err("Map can only handle one input. Got: %s" % " ".join(Task.inputs))

    global fun_reader, fun_writer, fun_partition
    fun_reader = util.unpack(job['map_reader'], globals=globals())
    fun_writer = util.unpack(job['map_writer'], globals=globals())
    fun_partition = util.unpack(job['partition'], globals=globals())

    global fun_init
    if 'map_init' in job:
        fun_init = util.unpack(job['map_init'], globals=globals())

    global fun_map
    if 'ext_map' in job:
        if 'ext_params' in job:
            map_params = job['ext_params']
        else:
            map_params = "0\n"
        path = Task.path("EXT_MAP")
        external.prepare(job['ext_map'], map_params, path)
        fun_map = external.ext_map
    else:
        map_params = util.unpack(job['params'], globals=globals())
        fun_map = util.unpack(job['map'], globals=globals())

    global fun_combiner
    if 'combiner' in job:
        fun_combiner = util.unpack(job['combiner'], globals=globals())

    init_common(job)

    nr_part = max(1, Task.num_partitions)
    if 'combiner' in job:
        partitions = [MapOutput(i, map_params, fun_combiner)\
                      for i in range(nr_part)]
    else:
        partitions = [MapOutput(i, map_params) for i in range(nr_part)]

    run_map(Task.inputs[0], partitions, map_params)
    external.close_ext()

    urls = {}
    for i, p in enumerate(partitions):
        p.close()
        urls["%d %s" % (i, p.url())] = True

    index, index_url = Task.map_index
    safe_update(index, urls)
    OutputURL(index_url)

def multi_file_iterator(self, inputs, progress=True,
                        reader=fun_reduce_reader):
    i = 0
    for fname in inputs:
        sze, fd = connect_input(fname)
        for x in reader(fd, sze, fname):
            yield x
            i += 1
            if progress and status_interval and\
               not i % status_interval:
                msg("%d entries reduced" % i)
    if progress:
        msg("Reduce done: %d entries reduced in total" % i)

def multi_file_iterator(self, inputs, params, progress=True,
                        reader=fun_reader):
    i = 0
    for url in inputs:
        fd, sze, url = connect_input(url, params)
        for x in reader(fd, sze, url):
            yield x
            i += 1
            if progress and status_interval and\
               not i % status_interval:
                msg("%d entries reduced" % i)
    if progress:
        msg("Reduce done: %d entries reduced in total" % i)

def download_and_sort(self, params):
    dlname = Task.path("REDUCE_DL", Task.id)
    msg("Reduce will be downloaded to %s" % dlname)
    out_fd = AtomicFile(dlname, "w")
    for url in self.inputs:
        fd, sze, url = connect_input(url, params)
        for k, v in fun_reader(fd, sze, url):
            if " " in k:
                err("Spaces are not allowed in keys "\
                    "with external sort.")
            if "\0" in v:
                err("Zero bytes are not allowed in "\
                    "values with external sort. "\
                    "Consider using base64 encoding.")
            out_fd.write("%s %s\0" % (k, v))
    out_fd.close()
    msg("Reduce input downloaded ok")
    msg("Starting external sort")
    sortname = Task.path("REDUCE_SORTED", Task.id)
    ensure_path(os.path.dirname(sortname))
    cmd = ["sort", "-n", "-k", "1,1", "-z",\
           "-t", " ", "-o", sortname, dlname]
    proc = subprocess.Popen(cmd)
    ret = proc.wait()
    if ret:
        err("Sorting %s to %s failed (%d)" %\
            (dlname, sortname, ret))
    msg("External sort done: %s" % sortname)
    return self.multi_file_iterator([sortname], params, reader=\
        lambda fd, sze, url:\
            re_reader("(?s)(.*?) (.*?)\000", fd, sze, url))

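# A sketch of the on-disk record format produced above for the external sort
# (an illustration, not part of the worker): each record is "<key> <value>\0",
# keys must not contain spaces and values must not contain NUL bytes. Once
# `sort -n -k 1,1 -z -t ' '` has ordered the records numerically by key, the
# same "(?s)(.*?) (.*?)\000" pattern recovers the (key, value) pairs:
import re

sorted_data = "\0".join(["1 a", "2 b", "3 c"]) + "\0"
pairs = re.findall("(?s)(.*?) (.*?)\000", sorted_data)
# pairs == [('1', 'a'), ('2', 'b'), ('3', 'c')]
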
def run_map(job_input, partitions, param):
    i = 0
    fd, sze, url = connect_input(job_input, param)
    nr_reduces = max(1, Task.num_partitions)
    reader = fun_reader(fd, sze, url)
    fun_init(reader, param)
    for entry in reader:
        for key, value in fun_map(entry, param):
            p = fun_partition(key, nr_reduces, param)
            partitions[p].add(key, value)
        i += 1
        if status_interval and not i % status_interval:
            msg("%d entries mapped" % i)
    msg("Done: %d entries mapped in total" % i)

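# A hedged sketch of a partition function compatible with the
# fun_partition(key, nr_reduces, param) call in run_map above. The name and
# body are illustrative; the real callable is supplied by the job via
# job['partition'].
def example_partition(key, nr_reduces, params):
    # Spread keys across the available partitions by hashing their string form.
    return hash(str(key)) % nr_reduces
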
def map(line, params):
    """
    hackreduce:search:history format: None, timestamp, id, search, frequency?
    """
    from datetime import datetime, timedelta
    from disco.util import msg

    time_grouping = 30

    try:
        unknown, timestamp, uid, query, frequency = line.split("','")
    except ValueError:
        msg(line)  # bad hack :-(

    time = timestamp.replace("'", "")
    date_obj = datetime.fromtimestamp(float(time[:-3]))  # timestamp has milliseconds, shave em off
    nearest_minute = date_obj - timedelta(
        minutes=date_obj.minute % time_grouping,
        seconds=date_obj.second,
        microseconds=date_obj.microsecond)

    # Give a score if the words within each query are in any of the 4 lists.
    sex = ('cockrings', 'sex', 't**s', 'naked', 'girls', 'f**k', 'suck',
           'teen', 'hot', 'cum', 'topless', 'nude', )
    travel = ('fly', 'flight', 'plane', 'drive', 'europe', 'america',
              'tours', 'map', 'hotel', 'cheap', 'asia', )
    nerd = ('java ', 'c ', 'c++', 'php', 'visual basic', 'perl', 'python',
            'c#', 'javascript', 'ruby', 'erlang', 'lisp', )
    cooking = ('ice', 'cream', 'recipe', 'pasta', 'sauce', 'soup', 'meat', )

    score = {'sex': 0, 'nerd': 0, 'travel': 0, 'cooking': 0}
    for word in query.split():
        for key in score.keys():
            score[key] += int(word.lower() in locals()[key])

    yield (nearest_minute, score)

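# A hedged sketch of a reduce that could pair with the scoring map above,
# assuming the job's reader/writer pair preserves the (datetime, dict) values
# emitted by map (an assumption; the job's reader is not shown in this
# excerpt). It sums the per-category scores for each time bucket.
def reduce(iter, out, params):
    totals = {}
    for minute, score in iter:
        bucket = totals.setdefault(minute, {})
        for category, value in score.items():
            bucket[category] = bucket.get(category, 0) + int(value)
    for minute, bucket in totals.items():
        out.add(minute, bucket)
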
def map(line, params):
    """
    hackreduce:search:history format: None, timestamp, id, search, frequency?
    """
    from datetime import datetime, timedelta
    from disco.util import msg

    try:
        unknown, timestamp, uid, query, frequency = line.split("','")
    except ValueError:
        msg(line)  # bad hack :-(

    time = timestamp.replace("'", "")
    date_obj = datetime.fromtimestamp(float(time[:-3]))  # timestamp has milliseconds, shave em off
    nearest_minute = date_obj - timedelta(minutes=date_obj.minute % 1,
                                          seconds=date_obj.second,
                                          microseconds=date_obj.microsecond)

    yield (nearest_minute, {'unique_id': uid,
                            'query': query,
                            'frequency': frequency})

def __init__(self, input_files, do_sort, mem_sort_limit, params):
    self.inputs = [url for input in input_files
                   for url in util.urllist(input, partid=Task.id)]
    random.shuffle(self.inputs)
    self.line_count = 0
    if do_sort:
        total_size = 0
        for input in self.inputs:
            fd, sze, url = connect_input(input, params)
            total_size += sze
        msg("Reduce[%d] input is %.2fMB" %\
            (Task.id, total_size / 1024.0**2))
        if total_size > mem_sort_limit:
            self.iterator = self.download_and_sort(params)
        else:
            msg("Sorting in memory")
            # Pass params through explicitly; multi_file_iterator expects
            # (inputs, params, progress) in this version.
            m = list(self.multi_file_iterator(self.inputs, params, False))
            m.sort(num_cmp)
            self.iterator = self.list_iterator(m)
    else:
        self.iterator = self.multi_file_iterator(self.inputs, params)

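# num_cmp is referenced by the in-memory sort paths above but is not defined
# in this excerpt. A hypothetical stand-in of roughly this shape would order
# (key, value) pairs numerically by key for list.sort(), falling back to
# plain string comparison when a key is not an integer:
def example_num_cmp(x, y):
    try:
        return cmp(int(x[0]), int(y[0]))
    except ValueError:
        return cmp(x[0], y[0])
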
def op_reduce(job):
    job_inputs = this_inputs()
    msg("Received a new reduce job!")
    do_sort = int(job['sort'])
    mem_sort_limit = int(job['mem_sort_limit'])
    req_mod = job['required_modules'].split()

    if 'reduce_init' in job:
        fun_init.func_code = marshal.loads(job['reduce_init'])

    fun_reduce_reader.func_code = marshal.loads(job['reduce_reader'])
    fun_reduce_writer.func_code = marshal.loads(job['reduce_writer'])

    if 'required_files' in job:
        write_files(marshal.loads(job['required_files']), REQ_FILES)
        sys.path.insert(0, REQ_FILES)

    import_modules(req_mod, [fun_reduce_reader, fun_reduce_writer,\
                             fun_reduce, fun_init])

    if 'ext_reduce' in job:
        if "ext_params" in job:
            red_params = job['ext_params']
        else:
            red_params = "0\n"
        external.prepare(job['ext_reduce'], red_params, EXT_REDUCE)
        fun_reduce.func_code = external.ext_reduce.func_code
    else:
        fun_reduce.func_code = marshal.loads(job['reduce'])
        red_params = cPickle.loads(job['params'])

    red_in = ReduceReader(job_inputs, do_sort, mem_sort_limit).iter()
    red_out = ReduceOutput(red_params)

    msg("Starting reduce")
    fun_init(red_in, red_params)
    fun_reduce(red_in, red_out, red_params)
    msg("Reduce done")

    red_out.close()
    external.close_ext()

    index = cStringIO.StringIO(os.path.basename(red_out.fname) + "\n")
    safe_append(index, REDUCE_INDEX)
    msg("dir://%s/%sreduce-index.txt" % (this_host(), JOB_HOME), "OUT")

def op_reduce(job):
    global job_name
    job_inputs = this_inputs()
    msg("Received a new reduce job!")
    do_sort = int(job['sort'])
    mem_sort_limit = int(job['mem_sort_limit'])
    required_modules = job['required_modules'].split()

    if 'reduce_init' in job:
        fun_init.func_code = marshal.loads(job['reduce_init'])

    fun_reduce_reader.func_code = marshal.loads(job['reduce_reader'])
    fun_reduce_writer.func_code = marshal.loads(job['reduce_writer'])

    if 'ext_reduce' in job:
        if "ext_params" in job:
            red_params = job['ext_params']
        else:
            red_params = "0\n"
        external.prepare(job['ext_reduce'], red_params,
                         EXT_REDUCE % job_name)
        fun_reduce.func_code = external.ext_reduce.func_code
    else:
        fun_reduce.func_code = marshal.loads(job['reduce'])
        red_params = cPickle.loads(job['params'])

    for m in required_modules:
        fun_reduce.func_globals.setdefault(m, __import__(m))

    red_in = ReduceReader(job_inputs, do_sort, mem_sort_limit).iter()
    red_out = ReduceOutput(red_params)

    msg("Starting reduce")
    fun_init(red_in, red_params)
    fun_reduce(red_in, red_out, red_params)
    msg("Reduce done")

    red_out.close()
    external.close_ext()
    msg("%d %s" % (this_partition(), red_out.disco_address()), "OUT")

def download_and_sort(self):
    dlname = REDUCE_DL % (job_name, this_partition())
    ensure_path(dlname, False)
    msg("Reduce will be downloaded to %s" % dlname)
    out_fd = file(dlname + ".partial", "w")
    for fname in self.inputs:
        sze, fd = connect_input(fname)
        for k, v in fun_reduce_reader(fd, sze, fname):
            if " " in k:
                err("Spaces are not allowed in keys "\
                    "with external sort.")
            if "\0" in v:
                err("Zero bytes are not allowed in "\
                    "values with external sort. "\
                    "Consider using base64 encoding.")
            out_fd.write("%s %s\0" % (k, v))
    out_fd.close()
    os.rename(dlname + ".partial", dlname)
    msg("Reduce input downloaded ok")
    msg("Starting external sort")
    sortname = REDUCE_SORTED % (job_name, this_partition())
    ensure_path(sortname, False)
    cmd = ["sort", "-n", "-s", "-k", "1,1", "-z",\
           "-t", " ", "-o", sortname, dlname]
    proc = subprocess.Popen(cmd)
    ret = proc.wait()
    if ret:
        err("Sorting %s to %s failed (%d)" %\
            (dlname, sortname, ret))
    msg("External sort done: %s" % sortname)
    return self.multi_file_iterator([sortname], reader=\
        lambda fd, sze, fname:\
            re_reader("(.*?) (.*?)\000", fd, sze, fname))

def op_reduce(job):
    msg("Received a new reduce job!")
    do_sort = int(job['sort'])
    mem_sort_limit = int(job['mem_sort_limit'])

    global fun_init
    if 'reduce_init' in job:
        fun_init = util.unpack(job['reduce_init'], globals=globals())

    global fun_reader, fun_writer
    fun_reader = util.unpack(job['reduce_reader'], globals=globals())
    fun_writer = util.unpack(job['reduce_writer'], globals=globals())

    global fun_reduce
    if 'ext_reduce' in job:
        if "ext_params" in job:
            red_params = job['ext_params']
        else:
            red_params = "0\n"
        path = Task.path("EXT_MAP")
        external.prepare(job['ext_reduce'], red_params, path)
        fun_reduce = external.ext_reduce
    else:
        fun_reduce = util.unpack(job['reduce'], globals=globals())
        red_params = util.unpack(job['params'], globals=globals())

    init_common(job)

    red_in = ReduceReader(Task.inputs, do_sort, mem_sort_limit,
                          red_params).iter()
    red_out = ReduceOutput(red_params)

    msg("Starting reduce")
    fun_init(red_in, red_params)
    fun_reduce(red_in, red_out, red_params)
    msg("Reduce done")

    red_out.close()
    external.close_ext()

    index, index_url = Task.reduce_index
    safe_update(index, {"%d %s" % (Task.id, red_out.url()): True})
    OutputURL(index_url)

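# A hedged sketch of a user-supplied reduce matching the
# fun_reduce(red_in, red_out, red_params) call above: red_in iterates over
# (key, value) pairs and red_out.add(key, value) emits results. The function
# name and the summing logic are illustrative, not part of the original job.
def example_reduce(iter, out, params):
    from itertools import groupby
    from operator import itemgetter
    # Assumption: the iterator is already ordered by key (sort enabled),
    # so consecutive runs of the same key can be grouped and summed.
    for key, pairs in groupby(iter, itemgetter(0)):
        out.add(key, sum(int(v) for k, v in pairs))
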
def op_map(job):
    job_input = this_inputs()
    msg("Received a new map job!")
    if len(job_input) != 1:
        err("Map can only handle one input. Got: %s" % " ".join(job_input))

    nr_reduces = int(job['nr_reduces'])
    nr_part = max(1, nr_reduces)

    fun_map_reader.func_code = marshal.loads(job['map_reader'])
    fun_map_writer.func_code = marshal.loads(job['map_writer'])
    fun_partition.func_code = marshal.loads(job['partition'])

    if 'map_init' in job:
        fun_init.func_code = marshal.loads(job['map_init'])

    if 'required_files' in job:
        write_files(marshal.loads(job['required_files']), REQ_FILES)
        sys.path.insert(0, REQ_FILES)

    req_mod = job['required_modules'].split()
    import_modules(req_mod, [fun_map_reader, fun_map_writer, fun_partition,
                             fun_map, fun_combiner, fun_init])

    if 'ext_map' in job:
        if 'ext_params' in job:
            map_params = job['ext_params']
        else:
            map_params = "0\n"
        external.prepare(job['ext_map'], map_params, EXT_MAP)
        fun_map.func_code = external.ext_map.func_code
    else:
        map_params = cPickle.loads(job['params'])
        fun_map.func_code = marshal.loads(job['map'])

    if 'combiner' in job:
        fun_combiner.func_code = marshal.loads(job['combiner'])
        partitions = [MapOutput(i, map_params, fun_combiner)\
                      for i in range(nr_part)]
    else:
        partitions = [MapOutput(i, map_params) for i in range(nr_part)]

    run_map(job_input[0], partitions, map_params)
    external.close_ext()
    for p in partitions:
        p.close()

    if nr_reduces:
        merge_partitions(partitions)
        n = os.path.basename(PART_OUTPUT % 0)
        msg("dir://%s/%s%s:%d" % (this_host(), JOB_HOME, n,
            len(partitions) - 1), "OUT")
    else:
        res = [os.path.basename(p.fname) for p in partitions]
        index = cStringIO.StringIO("\n".join(res) + "\n")
        safe_append(index, MAP_INDEX)
        msg("dir://%s/%smap-index.txt" %\
            (this_host(), JOB_HOME), "OUT")