def starter(program):
    """Build the combined input file, upload it and configure ``program``.

    Pops the mandatory ``original_parse``, ``original_ref`` and ``rule_file``
    options from ``program``, runs ``$P_DIR/hadoop/zip.py`` on the two corpus
    files with its output redirected to ``$TMPDIR/combined.txt``, puts that
    file on the backend filesystem as ``combined.txt`` and registers it as
    the job input.  The rule file and parts of the local environment are
    forwarded to the tasks through ``cmdenv`` options.

    Raises:
        AssertionError: if one of the mandatory options is missing or empty.
    """
    from dumbo.backends import get_backend

    def require(a):
        # Remove option ``a`` from the program and insist that it was given.
        var = program.delopt(a)
        assert var, a
        return var

    original_parse_file = require("original_parse")
    original_ref_file = require("original_ref")
    rule_file = require("rule_file")

    # NOTE(review): ShellRunner is defined elsewhere; presumably it expands
    # $NAME placeholders from ``shell.subs`` -- confirm.
    shell = ShellRunner({})
    shell.subs["CHINESE"] = original_parse_file
    shell.subs["ENGLISH"] = original_ref_file
    shell.subs["P_DIR"] = os.getenv("TRANSFOREST")
    shell.subs["PYTHON"] = os.getenv("PYTHON")
    shell.subs["TMPDIR"] = os.getenv("TMPDIR")
    shell.call("$PYTHON $P_DIR/hadoop/zip.py $CHINESE $ENGLISH > $TMPDIR/combined.txt")

    backend = get_backend(program.opts)
    fs = backend.create_filesystem(program.opts)
    fs.put(shell.complete("$TMPDIR/combined.txt"), "combined.txt", program.opts)

    program.addopt("input", "combined.txt")
    program.addopt("cmdenv", "RULE_FILE=%s" % rule_file)

    # Forward the local environment to the tasks.  An unset variable is
    # skipped: formatting None with %s would export the literal string
    # "None" (e.g. PYTHONPATH=None) to every task.
    for name in ("PYTHON", "TRANSFOREST", "LD_LIBRARY_PATH", "PYTHONPATH"):
        value = os.getenv(name)
        if value is not None:
            program.addopt("cmdenv", "%s=%s" % (name, value))
def run(self):
    """Execute the iterations registered in ``self.iters``.

    If a first command-line argument exists and does not start with '-',
    only the iteration whose index is ``sys.argv[2]`` (default 0) is handed
    to the module-level ``run``.  Otherwise every iteration is launched in
    order: intermediate results are written to ``<output>_pre<N>`` and are
    removed after the iteration that ``self.deps`` associates with them,
    unless the ``preoutputs`` option contains 'yes'.
    """
    if len(sys.argv) > 1 and sys.argv[1][0] != '-':
        iterarg = int(sys.argv[2]) if len(sys.argv) > 2 else 0
        # A one-element slice is used instead of indexing; the loop is not
        # strictly needed but apparently helps error reporting.
        for args, kwargs in self.iters[iterarg:iterarg + 1]:
            kwargs['iter'] = iterarg
            run(*args, **kwargs)
        return

    for _iter, (args, kwargs) in enumerate(self.iters):
        kwargs['iter'] = _iter
        opts = Options(kwargs.get('opts', []))
        opts += parseargs(sys.argv[1:])

        # Backend and filesystem must be created early, while all the
        # options are still present.
        backend = get_backend(opts)
        fs = backend.create_filesystem(opts)

        preoutputsopt = opts.pop('preoutputs')
        delinputsopt = opts.pop('delinputs')

        job_inputs = opts['input']
        if not job_inputs:
            print >> sys.stderr, 'ERROR: No input path specified'
            sys.exit(1)
        outputopt = opts['output']
        if not outputopt:
            print >> sys.stderr, 'ERROR: No output path specified'
            sys.exit(1)
        job_output = outputopt[0]

        def pre_path(index):
            # Path holding the intermediate output of iteration ``index``.
            return job_output + "_pre" + str(index + 1)

        newopts = Options()
        newopts.add('iteration', str(_iter))
        newopts.add('itercount', str(len(self.iters)))

        _input = kwargs['input']
        if type(_input) == int:
            _input = [_input]

        if _input == [-1]:
            # This iteration reads the job's own input.
            kwargs['input'] = job_inputs
            if 'yes' in delinputsopt and _iter == self.deps[-1]:
                delinputs = 'yes'
            else:
                delinputs = 'no'
            newopts.add('delinputs', delinputs)
        else:
            if -1 in _input:
                print >> sys.stderr, 'ERROR: Cannot mix job input with intermediate results'
                sys.exit(1)
            kwargs['input'] = [pre_path(initer) for initer in _input]
            newopts.add('inputformat', 'code')
            if 'yes' in opts['addpath']:  # not when == 'iter'
                newopts.add('addpath', 'no')
            newopts.add('delinputs', 'no')

        is_last = _iter == len(self.iters) - 1
        if is_last:
            kwargs['output'] = job_output
        else:
            kwargs['output'] = pre_path(_iter)
            newopts.add('outputformat', 'code')
            if 'yes' in opts['getpath']:  # not when == 'iter'
                newopts.add('getpath', 'no')

        # The per-iteration options replace any same-named existing ones.
        overridden = [name for name, _ in opts if name in newopts]
        opts.remove(*overridden)
        opts += newopts
        kwargs['opts'] = opts

        run(*args, **kwargs)

        if 'yes' not in preoutputsopt and _input != [-1]:
            for initer in _input:
                if _iter == self.deps[initer]:
                    fs.rm(pre_path(initer), opts)
def run(mapper, reducer=None, combiner=None, buffersize=None, mapconf=None, redconf=None, combconf=None, mapclose=None, redclose=None, combclose=None, mapcleanup=None, redcleanup=None, combcleanup=None, opts=None, input=None, output=None, iter=0): if len(sys.argv) > 1 and not sys.argv[1][0] == '-': iterarg = 0 # default value if len(sys.argv) > 2: iterarg = int(sys.argv[2]) memlim = None # memory limit if len(sys.argv) > 3: memlim = int(sys.argv[3]) resource.setrlimit(resource.RLIMIT_AS, (memlim, memlim)) mrbase_class = loadclassname(os.environ['dumbo_mrbase_class']) jk_class = loadclassname(os.environ['dumbo_jk_class']) runinfo = loadclassname(os.environ['dumbo_runinfo_class'])() if iterarg == iter: if sys.argv[1].startswith('map'): if type(mapper) in (types.ClassType, type): mappercls = type('DumboMapper', (mapper, mrbase_class), {}) mapper = mappercls() if hasattr(mapper, 'configure'): mapconf = mapper.configure if hasattr(mapper, 'close'): mapclose = mapper.close if hasattr(mapper, 'map'): mapper = mapper.map if hasattr(mapper, 'cleanup'): mapcleanup = mapper.cleanup if type(combiner) in (types.ClassType, type): combinercls = type('DumboCombiner', (combiner, mrbase_class), {}) combiner = combinercls() if hasattr(combiner, 'configure'): combconf = combiner.configure if hasattr(combiner, 'close'): combclose = combiner.close if hasattr(combiner, 'reduce'): combiner = combiner.reduce if hasattr(combiner, 'cleanup'): combcleanup = combiner.cleanup try: print >> sys.stderr, "INFO: consuming %s" % \ os.environ['map_input_file'] except KeyError: pass if os.environ.has_key('stream_map_input') and \ os.environ['stream_map_input'].lower() == 'typedbytes': print >> sys.stderr, "INFO: inputting typed bytes" try: import ctypedbytes as typedbytes except ImportError: import typedbytes inputs = typedbytes.PairedInput(sys.stdin).reads() else: inputs = loadcode(line[:-1] for line in sys.stdin) if mapconf: mapconf() if combconf: combconf() if os.environ.has_key('dumbo_addpath'): 
path = runinfo.get_input_path() inputs = (((path, k), v) for (k, v) in inputs) if os.environ.has_key('dumbo_joinkeys'): inputs = ((jk_class(k), v) for (k, v) in inputs) if os.environ.has_key('dumbo_parser'): parser = os.environ['dumbo_parser'] clsname = parser.split('.')[-1] modname = '.'.join(parser.split('.')[:-1]) if not modname: raise ImportError(parser) module = __import__(modname, fromlist=[clsname]) parse = getattr(module, clsname)().parse outputs = itermap(inputs, mapper, parse) elif os.environ.has_key('dumbo_record'): record = os.environ['dumbo_record'] clsname = record.split('.')[-1] modname = '.'.join(record.split('.')[:-1]) if not modname: raise ImportError(parser) module = __import__(modname, fromlist=[clsname]) set = getattr(module, clsname)().set outputs = itermap(inputs, mapper, lambda v: set(*v)) else: outputs = itermap(inputs, mapper) if mapcleanup: outputs = chain(outputs, mapcleanup()) # Combiner if combiner and type(combiner) != str: if (not buffersize) and memlim: buffersize = int(memlim * 0.33) / 512 # educated guess print >> sys.stderr, 'INFO: buffersize =', buffersize inputs = sorted(outputs, buffersize) if os.environ.has_key('dumbo_joinkeys'): outputs = iterreduce(inputs, combiner, keyfunc=jk_class.fromjoinkey) else: outputs = iterreduce(inputs, combiner) if os.environ.has_key('dumbo_joinkeys'): outputs = ((jk.dump(), v) for (jk, v) in outputs) if combcleanup: outputs = chain(outputs, combcleanup()) if os.environ.has_key('stream_map_output') and \ os.environ['stream_map_output'].lower() == 'typedbytes': print >> sys.stderr, "INFO: outputting typed bytes" try: import ctypedbytes as typedbytes except ImportError: import typedbytes typedbytes.PairedOutput(sys.stdout).writes(outputs) else: for output in dumpcode(outputs): print '\t'.join(output) if combclose: combclose() if mapclose: mapclose() elif reducer: # Reducer if type(reducer) in (types.ClassType, type): reducercls = type('DumboReducer', (reducer, mrbase_class), {}) reducer = 
reducercls() if hasattr(reducer, 'configure'): redconf = reducer.configure if hasattr(reducer, 'close'): redclose = reducer.close if hasattr(reducer, 'reduce'): reducer = reducer.reduce if hasattr(reducer, 'cleanup'): redcleanup = reducer.cleanup if os.environ.has_key('stream_reduce_input') and \ os.environ['stream_reduce_input'].lower() == 'typedbytes': print >> sys.stderr, "INFO: inputting typed bytes" try: import ctypedbytes as typedbytes except ImportError: import typedbytes inputs = typedbytes.PairedInput(sys.stdin).reads() else: inputs = loadcode(line[:-1] for line in sys.stdin) if redconf: redconf() if os.environ.has_key('dumbo_joinkeys'): outputs = iterreduce(inputs, reducer, keyfunc=jk_class.fromdump) outputs = ((jk.body, v) for (jk, v) in outputs) else: outputs = iterreduce(inputs, reducer) if redcleanup: outputs = chain(outputs, redcleanup()) if os.environ.has_key('stream_reduce_output') and \ os.environ['stream_reduce_output'].lower() == 'typedbytes': print >> sys.stderr, "INFO: outputting typed bytes" try: import ctypedbytes as typedbytes except ImportError: import typedbytes typedbytes.PairedOutput(sys.stdout).writes(outputs) else: for output in dumpcode(outputs): print '\t'.join(output) if redclose: redclose() else: for output in dumpcode(inputs): print '\t'.join(output) else: opts = Options(opts) if type(mapper) == str: opts.add('mapper', mapper) elif hasattr(mapper, 'opts'): opts += mapper.opts if type(reducer) == str: opts.add('reducer', reducer) elif hasattr(reducer, 'opts'): opts += reducer.opts if type(combiner) == str: opts.add('combiner', combiner) opts += parseargs(sys.argv[1:]) if input is not None: opts.remove('input') for infile in input: opts.add('input', infile) if output is None: outputopt = opts['output'] if not outputopt: print >> sys.stderr, 'ERROR: No output path specified' sys.exit(1) output = outputopt[0] newopts = Options() newopts.add('output', output) if not reducer: newopts.add('numreducetasks', '0') keys = [k for k, _ in 
opts if k in newopts] opts.remove(*keys) opts += newopts backend = get_backend(opts) overwriteopt = opts.pop('overwrite') checkoutput = 'no' not in opts.pop('checkoutput') fs = backend.create_filesystem(opts) if 'yes' in overwriteopt: fs.rm(output, opts) elif checkoutput and fs.exists(output, opts) == 0: print >> sys.stderr, 'ERROR: Output path exists already: %s' % output sys.exit(1) opts.add('cmdenv', 'dumbo_mrbase_class=' + \ getclassname(backend.get_mapredbase_class(opts))) opts.add('cmdenv', 'dumbo_jk_class=' + \ getclassname(backend.get_joinkey_class(opts))) opts.add('cmdenv', 'dumbo_runinfo_class=' + \ getclassname(backend.get_runinfo_class(opts))) retval = backend.create_iteration(opts).run() if retval == 127: print >> sys.stderr, 'ERROR: Are you sure that "python" is on your path?' if retval != 0: sys.exit(retval)
def run(self):
    """Drive all iterations of the job, or dispatch a single task.

    Task mode (first command-line argument present and not starting with
    '-'): the iteration selected by ``sys.argv[2]`` (default 0) is passed to
    the module-level ``run``.

    Launcher mode: each entry of ``self.iters`` is run in turn.  An input of
    -1 means the job's own input; other values name earlier iterations,
    whose results are read from ``<output>_pre<N>``.  Those intermediate
    paths are removed after the iteration recorded for them in
    ``self.deps``, unless the ``preoutputs`` option contains 'yes'.
    """
    task_mode = len(sys.argv) > 1 and not sys.argv[1][0] == '-'
    if task_mode:
        if len(sys.argv) > 2:
            iterarg = int(sys.argv[2])
        else:
            iterarg = 0
        # The loop over a one-element slice is not strictly necessary but
        # apparently helps error reporting.
        selected = self.iters[iterarg:iterarg + 1]
        for args, kwargs in selected:
            kwargs['iter'] = iterarg
            run(*args, **kwargs)
        return

    for _iter, (args, kwargs) in enumerate(self.iters):
        kwargs['iter'] = _iter

        opts = Options(kwargs.get('opts', []))
        opts += parseargs(sys.argv[1:])

        # Must happen early, while all the options are still there.
        backend = get_backend(opts)
        fs = backend.create_filesystem(opts)

        preoutputsopt = opts.pop('preoutputs')
        delinputsopt = opts.pop('delinputs')

        job_inputs = opts['input']
        if not job_inputs:
            print >> sys.stderr, 'ERROR: No input path specified'
            sys.exit(1)

        outputopt = opts['output']
        if not outputopt:
            print >> sys.stderr, 'ERROR: No output path specified'
            sys.exit(1)
        job_output = outputopt[0]
        pre_prefix = job_output + "_pre"

        newopts = Options()
        newopts.add('iteration', str(_iter))
        newopts.add('itercount', str(len(self.iters)))

        _input = kwargs['input']
        if type(_input) == int:
            _input = [_input]
        reads_job_input = _input == [-1]

        if reads_job_input:
            kwargs['input'] = job_inputs
            delinputs = 'no'
            if 'yes' in delinputsopt and _iter == self.deps[-1]:
                delinputs = 'yes'
            newopts.add('delinputs', delinputs)
        else:
            if -1 in _input:
                print >> sys.stderr, 'ERROR: Cannot mix job input with intermediate results'
                sys.exit(1)
            intermediate = []
            for initer in _input:
                intermediate.append(pre_prefix + str(initer + 1))
            kwargs['input'] = intermediate
            newopts.add('inputformat', 'code')
            if 'yes' in opts['addpath']:  # not when == 'iter'
                newopts.add('addpath', 'no')
            newopts.add('delinputs', 'no')

        if _iter != len(self.iters) - 1:
            # Not the final iteration: write an intermediate result.
            kwargs['output'] = pre_prefix + str(_iter + 1)
            newopts.add('outputformat', 'code')
            if 'yes' in opts['getpath']:  # not when == 'iter'
                newopts.add('getpath', 'no')
        else:
            kwargs['output'] = job_output

        # Drop every option that newopts overrides, then merge newopts in.
        stale = [key for key, _ in opts if key in newopts]
        opts.remove(*stale)
        opts += newopts
        kwargs['opts'] = opts

        run(*args, **kwargs)

        if 'yes' not in preoutputsopt and not reads_job_input:
            for initer in _input:
                if _iter == self.deps[initer]:
                    fs.rm(pre_prefix + str(initer + 1), opts)
def run(mapper, reducer=None, combiner=None, buffersize=None, mapconf=None, redconf=None, combconf=None, mapclose=None, redclose=None, combclose=None, opts=None, input=None, output=None, iter=0): if len(sys.argv) > 1 and not sys.argv[1][0] == '-': iterarg = 0 # default value if len(sys.argv) > 2: iterarg = int(sys.argv[2]) memlim = None # memory limit if len(sys.argv) > 3: memlim = int(sys.argv[3]) resource.setrlimit(resource.RLIMIT_AS, (memlim, memlim)) mrbase_class = loadclassname(os.environ['dumbo_mrbase_class']) jk_class = loadclassname(os.environ['dumbo_jk_class']) runinfo = loadclassname(os.environ['dumbo_runinfo_class'])() if iterarg == iter: if sys.argv[1].startswith('map'): if type(mapper) in (types.ClassType, type): mappercls = type('DumboMapper', (mapper, mrbase_class), {}) mapper = mappercls() if hasattr(mapper, 'configure'): mapconf = mapper.configure if hasattr(mapper, 'close'): mapclose = mapper.close if hasattr(mapper, 'map'): mapper = mapper.map if type(combiner) in (types.ClassType, type): combinercls = type('DumboCombiner', (combiner, mrbase_class), {}) combiner = combinercls() if hasattr(combiner, 'configure'): combconf = combiner.configure if hasattr(combiner, 'close'): combclose = combiner.close if hasattr(combiner, 'reduce'): combiner = combiner.reduce try: print >> sys.stderr, "INFO: consuming %s" % \ os.environ['map_input_file'] except KeyError: pass if os.environ.has_key('stream_map_input') and \ os.environ['stream_map_input'].lower() == 'typedbytes': print >> sys.stderr, "INFO: inputting typed bytes" try: import ctypedbytes as typedbytes except ImportError: import typedbytes inputs = typedbytes.PairedInput(sys.stdin).reads() else: inputs = loadcode(line[:-1] for line in sys.stdin) if mapconf: mapconf() if combconf: combconf() if os.environ.has_key('dumbo_addpath'): path = runinfo.get_input_path() inputs = (((path, k), v) for (k, v) in inputs) if os.environ.has_key('dumbo_joinkeys'): inputs = ((jk_class(k), v) for (k, v) in inputs) if 
os.environ.has_key('dumbo_parser'): parser = os.environ['dumbo_parser'] clsname = parser.split('.')[-1] modname = '.'.join(parser.split('.')[:-1]) if not modname: raise ImportError(parser) module = __import__(modname, fromlist=[clsname]) parse = getattr(module, clsname)().parse outputs = itermap(inputs, mapper, parse) elif os.environ.has_key('dumbo_record'): record = os.environ['dumbo_record'] clsname = record.split('.')[-1] modname = '.'.join(record.split('.')[:-1]) if not modname: raise ImportError(parser) module = __import__(modname, fromlist=[clsname]) set = getattr(module, clsname)().set outputs = itermap(inputs, mapper, lambda v: set(*v)) else: outputs = itermap(inputs, mapper) if combiner and type(combiner) != str: if (not buffersize) and memlim: buffersize = int(memlim * 0.33) / 512 # educated guess print >> sys.stderr, 'INFO: buffersize =', buffersize inputs = sorted(outputs, buffersize) if os.environ.has_key('dumbo_joinkeys'): outputs = iterreduce(inputs, combiner, keyfunc=jk_class.fromjoinkey) else: outputs = iterreduce(inputs, combiner) if os.environ.has_key('dumbo_joinkeys'): outputs = ((jk.dump(), v) for (jk, v) in outputs) if os.environ.has_key('stream_map_output') and \ os.environ['stream_map_output'].lower() == 'typedbytes': print >> sys.stderr, "INFO: outputting typed bytes" try: import ctypedbytes as typedbytes except ImportError: import typedbytes typedbytes.PairedOutput(sys.stdout).writes(outputs) else: for output in dumpcode(outputs): print '\t'.join(output) if combclose: combclose() if mapclose: mapclose() elif reducer: if type(reducer) in (types.ClassType, type): reducercls = type('DumboReducer', (reducer, mrbase_class), {}) reducer = reducercls() if hasattr(reducer, 'configure'): redconf = reducer.configure if hasattr(reducer, 'close'): redclose = reducer.close if hasattr(reducer, 'reduce'): reducer = reducer.reduce if os.environ.has_key('stream_reduce_input') and \ os.environ['stream_reduce_input'].lower() == 'typedbytes': print >> 
sys.stderr, "INFO: inputting typed bytes" try: import ctypedbytes as typedbytes except ImportError: import typedbytes inputs = typedbytes.PairedInput(sys.stdin).reads() else: inputs = loadcode(line[:-1] for line in sys.stdin) if redconf: redconf() if os.environ.has_key('dumbo_joinkeys'): outputs = iterreduce(inputs, reducer, keyfunc=jk_class.fromdump) outputs = ((jk.body, v) for (jk, v) in outputs) else: outputs = iterreduce(inputs, reducer) if os.environ.has_key('stream_reduce_output') and \ os.environ['stream_reduce_output'].lower() == 'typedbytes': print >> sys.stderr, "INFO: outputting typed bytes" try: import ctypedbytes as typedbytes except ImportError: import typedbytes typedbytes.PairedOutput(sys.stdout).writes(outputs) else: for output in dumpcode(outputs): print '\t'.join(output) if redclose: redclose() else: for output in dumpcode(inputs): print '\t'.join(output) else: opts = Options(opts) if type(mapper) == str: opts.add('mapper', mapper) elif hasattr(mapper, 'opts'): opts += mapper.opts if type(reducer) == str: opts.add('reducer', reducer) elif hasattr(reducer, 'opts'): opts += reducer.opts if type(combiner) == str: opts.add('combiner', combiner) opts += parseargs(sys.argv[1:]) if input is not None: opts.remove('input') for infile in input: opts.add('input', infile) if output is None: outputopt = opts['output'] if not outputopt: print >> sys.stderr, 'ERROR: No output path specified' sys.exit(1) output = outputopt[0] newopts = Options() newopts.add('output', output) if not reducer: newopts.add('numreducetasks', '0') keys = [k for k, _ in opts if k in newopts] opts.remove(*keys) opts += newopts backend = get_backend(opts) overwriteopt = opts.pop('overwrite') checkoutput = 'no' not in opts.pop('checkoutput') fs = backend.create_filesystem(opts) if 'yes' in overwriteopt: fs.rm(output, opts) elif checkoutput and fs.exists(output, opts) == 0: print >> sys.stderr, 'ERROR: Output path exists already: %s' % output sys.exit(1) opts.add('cmdenv', 
'dumbo_mrbase_class=' + \ getclassname(backend.get_mapredbase_class(opts))) opts.add('cmdenv', 'dumbo_jk_class=' + \ getclassname(backend.get_joinkey_class(opts))) opts.add('cmdenv', 'dumbo_runinfo_class=' + \ getclassname(backend.get_runinfo_class(opts))) retval = backend.create_iteration(opts).run() if retval == 127: print >> sys.stderr, 'ERROR: Are you sure that "python" is on your path?' if retval != 0: sys.exit(retval)
def run(self):
    """Run every iteration in ``self.iters`` through the module-level ``run``.

    In task mode (first command-line argument present and not starting with
    '-') each iteration is simply passed to ``run`` with its index as
    ``iter``.  Otherwise the per-iteration options are derived: an input of
    -1 means the job's own input, other values name earlier iterations whose
    results live in ``<output>_pre<N>``.  Intermediate results are removed
    after the iteration recorded for them in ``self.deps`` unless the
    ``preoutputs`` option is 'yes'.
    """
    for (_iter, (args, kwargs)) in enumerate(self.iters):
        kwargs['iter'] = _iter
        if len(sys.argv) > 1 and not sys.argv[1][0] == '-':
            run(*args, **kwargs)
        else:
            # Work on a copy: ``+=`` on the caller-supplied list would
            # otherwise append the command-line options to it in place.
            opts = list(kwargs.get('opts', []))
            opts += parseargs(sys.argv[1:])

            # this has to be done early, while all the opts are still there
            backend = get_backend(opts)
            fs = backend.create_filesystem(opts)

            preoutputsopt = getopt(opts, 'preoutputs')
            delinputsopt = getopt(opts, 'delinputs')
            addpathopt = getopt(opts, 'addpath', delete=False)
            getpathopt = getopt(opts, 'getpath', delete=False)

            job_inputs = getopt(opts, 'input', delete=False)
            if not job_inputs:
                print >> sys.stderr, 'ERROR: No input path specified'
                sys.exit(1)
            outputopt = getopt(opts, 'output', delete=False)
            if not outputopt:
                print >> sys.stderr, 'ERROR: No output path specified'
                sys.exit(1)
            job_output = outputopt[0]

            newopts = {}
            newopts['iteration'] = str(_iter)
            newopts['itercount'] = str(len(self.iters))

            _input = kwargs['input']
            if isinstance(_input, int):
                _input = [_input]
            if _input == [-1]:
                kwargs['input'] = job_inputs
                if delinputsopt and delinputsopt[0] == 'yes' and _iter == self.deps[-1]:
                    newopts['delinputs'] = 'yes'
                else:
                    newopts['delinputs'] = 'no'
            else:
                if -1 in _input:
                    print >> sys.stderr, 'ERROR: Cannot mix job input with intermediate results'
                    sys.exit(1)
                kwargs['input'] = [job_output + "_pre" + str(initer + 1)
                                   for initer in _input]
                newopts['inputformat'] = 'code'
                if addpathopt and addpathopt[0] == 'yes':  # not when == 'iter'
                    newopts['addpath'] = 'no'
                newopts['delinputs'] = 'no'  # we'll take care of it ourselves

            if _iter == len(self.iters) - 1:
                kwargs['output'] = job_output
            else:
                kwargs['output'] = job_output + "_pre" + str(_iter + 1)
                newopts['outputformat'] = 'code'
                if getpathopt and getpathopt[0] == 'yes':  # not when == 'iter'
                    newopts['getpath'] = 'no'

            # The per-iteration options replace any same-named existing ones.
            opts = [(key, value) for (key, value) in opts if key not in newopts]
            opts += newopts.iteritems()
            kwargs['opts'] = opts

            run(*args, **kwargs)

            if not (preoutputsopt and preoutputsopt[0] == 'yes') and _input != [-1]:
                for initer in _input:
                    if _iter == self.deps[initer]:
                        fs.rm(job_output + "_pre" + str(initer + 1), opts)
def run(self):
    """Run every iteration in ``self.iters`` through the module-level ``run``.

    In task mode (first command-line argument present and not starting with
    '-') each iteration is simply passed to ``run`` with its index as
    ``iter``.  Otherwise the per-iteration options are derived: an input of
    -1 means the job's own input, other values name earlier iterations whose
    results live in ``<output>_pre<N>``.  Intermediate results are removed
    after the iteration recorded for them in ``self.deps`` unless the
    ``preoutputs`` option is 'yes'.
    """
    for (_iter, (args, kwargs)) in enumerate(self.iters):
        kwargs['iter'] = _iter
        if len(sys.argv) > 1 and not sys.argv[1][0] == '-':
            run(*args, **kwargs)
        else:
            # Work on a copy: ``+=`` on the caller-supplied list would
            # otherwise append the command-line options to it in place.
            opts = list(kwargs.get('opts', []))
            opts += parseargs(sys.argv[1:])

            # Create the backend and filesystem before anything consumes
            # options: after run() the list may no longer hold what
            # get_backend() needs to pick the right backend.
            backend = get_backend(opts)
            fs = backend.create_filesystem(opts)

            preoutputsopt = getopt(opts, 'preoutputs')
            delinputsopt = getopt(opts, 'delinputs')
            addpathopt = getopt(opts, 'addpath', delete=False)
            getpathopt = getopt(opts, 'getpath', delete=False)

            job_inputs = getopt(opts, 'input', delete=False)
            if not job_inputs:
                print >> sys.stderr, 'ERROR: No input path specified'
                sys.exit(1)
            outputopt = getopt(opts, 'output', delete=False)
            if not outputopt:
                print >> sys.stderr, 'ERROR: No output path specified'
                sys.exit(1)
            job_output = outputopt[0]

            newopts = {}
            newopts['iteration'] = str(_iter)
            newopts['itercount'] = str(len(self.iters))

            _input = kwargs['input']
            if isinstance(_input, int):
                _input = [_input]
            if _input == [-1]:
                kwargs['input'] = job_inputs
                if delinputsopt and delinputsopt[0] == 'yes' and _iter == self.deps[-1]:
                    newopts['delinputs'] = 'yes'
                else:
                    newopts['delinputs'] = 'no'
            else:
                if -1 in _input:
                    print >> sys.stderr, 'ERROR: Cannot mix job input with intermediate results'
                    sys.exit(1)
                kwargs['input'] = [job_output + "_pre" + str(initer + 1)
                                   for initer in _input]
                newopts['inputformat'] = 'code'
                if addpathopt and addpathopt[0] == 'yes':  # not when == 'iter'
                    newopts['addpath'] = 'no'
                newopts['delinputs'] = 'no'  # we'll take care of it ourselves

            if _iter == len(self.iters) - 1:
                kwargs['output'] = job_output
            else:
                kwargs['output'] = job_output + "_pre" + str(_iter + 1)
                newopts['outputformat'] = 'code'
                if getpathopt and getpathopt[0] == 'yes':  # not when == 'iter'
                    newopts['getpath'] = 'no'

            # The per-iteration options replace any same-named existing ones.
            opts = [(key, value) for (key, value) in opts if key not in newopts]
            opts += newopts.iteritems()
            kwargs['opts'] = opts

            run(*args, **kwargs)

            if not (preoutputsopt and preoutputsopt[0] == 'yes') and _input != [-1]:
                for initer in _input:
                    if _iter == self.deps[initer]:
                        fs.rm(job_output + "_pre" + str(initer + 1), opts)
def run(self):
    """Run every iteration in ``self.iters`` through the module-level ``run``.

    In task mode (first command-line argument present and not starting with
    "-") each iteration is simply passed to ``run`` with its index as
    ``iter``.  Otherwise the per-iteration options are derived: an input of
    -1 means the job's own input, other values name earlier iterations whose
    results live in ``<output>_pre<N>``.  Intermediate results are removed
    after the iteration recorded for them in ``self.deps`` unless the
    ``preoutputs`` option is "yes".
    """
    for (_iter, (args, kwargs)) in enumerate(self.iters):
        kwargs["iter"] = _iter
        if len(sys.argv) > 1 and not sys.argv[1][0] == "-":
            run(*args, **kwargs)
        else:
            # Work on a copy: ``+=`` on the caller-supplied list would
            # otherwise append the command-line options to it in place.
            opts = list(kwargs.get("opts", []))
            opts += parseargs(sys.argv[1:])

            # Create the backend and filesystem before anything consumes
            # options: after run() the list may no longer hold what
            # get_backend() needs to pick the right backend.
            backend = get_backend(opts)
            fs = backend.create_filesystem(opts)

            preoutputsopt = getopt(opts, "preoutputs")
            delinputsopt = getopt(opts, "delinputs")
            addpathopt = getopt(opts, "addpath", delete=False)
            getpathopt = getopt(opts, "getpath", delete=False)

            job_inputs = getopt(opts, "input", delete=False)
            if not job_inputs:
                print >> sys.stderr, "ERROR: No input path specified"
                sys.exit(1)
            outputopt = getopt(opts, "output", delete=False)
            if not outputopt:
                print >> sys.stderr, "ERROR: No output path specified"
                sys.exit(1)
            job_output = outputopt[0]

            newopts = {}
            newopts["iteration"] = str(_iter)
            newopts["itercount"] = str(len(self.iters))

            _input = kwargs["input"]
            if isinstance(_input, int):
                _input = [_input]
            if _input == [-1]:
                kwargs["input"] = job_inputs
                if delinputsopt and delinputsopt[0] == "yes" and _iter == self.deps[-1]:
                    newopts["delinputs"] = "yes"
                else:
                    newopts["delinputs"] = "no"
            else:
                if -1 in _input:
                    print >> sys.stderr, "ERROR: Cannot mix job input with intermediate results"
                    sys.exit(1)
                kwargs["input"] = [job_output + "_pre" + str(initer + 1)
                                   for initer in _input]
                newopts["inputformat"] = "code"
                if addpathopt and addpathopt[0] == "yes":  # not when == 'iter'
                    newopts["addpath"] = "no"
                newopts["delinputs"] = "no"  # we'll take care of it ourselves

            if _iter == len(self.iters) - 1:
                kwargs["output"] = job_output
            else:
                kwargs["output"] = job_output + "_pre" + str(_iter + 1)
                newopts["outputformat"] = "code"
                if getpathopt and getpathopt[0] == "yes":  # not when == 'iter'
                    newopts["getpath"] = "no"

            # The per-iteration options replace any same-named existing ones.
            opts = [(key, value) for (key, value) in opts if key not in newopts]
            opts += newopts.iteritems()
            kwargs["opts"] = opts

            run(*args, **kwargs)

            if not (preoutputsopt and preoutputsopt[0] == "yes") and _input != [-1]:
                for initer in _input:
                    if _iter == self.deps[initer]:
                        fs.rm(job_output + "_pre" + str(initer + 1), opts)
def run( mapper, reducer=None, combiner=None, buffersize=None, partitioner=None, grouper=None, mapconf=None, redconf=None, combconf=None, mapclose=None, redclose=None, combclose=None, opts=None, input=None, output=None, premapper=None, postreducer=None, iter=0, ): if len(sys.argv) > 1 and not sys.argv[1][0] == "-": # This case corresponds to running a mapper or reducer iterarg = 0 # default value if len(sys.argv) > 2: iterarg = int(sys.argv[2]) memlim = None # memory limit if len(sys.argv) > 3: memlim = int(sys.argv[3]) resource.setrlimit(resource.RLIMIT_AS, (memlim, memlim)) mrbase_class = loadclassname(os.environ["dumbo_mrbase_class"]) jk_class = loadclassname(os.environ["dumbo_jk_class"]) runinfo = loadclassname(os.environ["dumbo_runinfo_class"])() if iterarg == iter: if sys.argv[1].startswith("map"): if type(mapper) in (types.ClassType, type): mappercls = type("DumboMapper", (mapper, mrbase_class), {}) mapper = mappercls() if hasattr(mapper, "configure"): mapconf = mapper.configure if hasattr(mapper, "close"): mapclose = mapper.close if hasattr(mapper, "map"): mapper = mapper.map if type(combiner) in (types.ClassType, type): combinercls = type("DumboCombiner", (combiner, mrbase_class), {}) combiner = combinercls() if hasattr(combiner, "configure"): combconf = combiner.configure if hasattr(combiner, "close"): combclose = combiner.close if hasattr(combiner, "reduce"): combiner = combiner.reduce try: print >> sys.stderr, "INFO: consuming %s" % os.environ["map_input_file"] except KeyError: pass if os.environ.has_key("stream_map_input") and os.environ["stream_map_input"].lower() == "typedbytes": print >> sys.stderr, "INFO: inputting typed bytes" try: import ctypedbytes as typedbytes print >> sys.stderr, "INFO: using ctypedbytes" except ImportError: import typedbytes inputs = typedbytes.PairedInput(sys.stdin).reads() else: inputs = loadcode(line[:-1] for line in sys.stdin) if mapconf: mapconf() if combconf: combconf() if os.environ.has_key("dumbo_addpath"): path = 
runinfo.get_input_path() inputs = (((path, k), v) for (k, v) in inputs) if os.environ.has_key("dumbo_joinkeys"): inputs = ((jk_class(k), v) for (k, v) in inputs) if os.environ.has_key("dumbo_parser"): parser = os.environ["dumbo_parser"] clsname = parser.split(".")[-1] modname = ".".join(parser.split(".")[:-1]) if not modname: raise ImportError(parser) module = __import__(modname, fromlist=[clsname]) parse = getattr(module, clsname)().parse outputs = itermap(inputs, mapper, parse) elif os.environ.has_key("dumbo_record"): record = os.environ["dumbo_record"] clsname = record.split(".")[-1] modname = ".".join(record.split(".")[:-1]) if not modname: raise ImportError(parser) module = __import__(modname, fromlist=[clsname]) set = getattr(module, clsname)().set outputs = itermap(inputs, mapper, lambda v: set(*v)) else: outputs = itermap(inputs, mapper) if combiner and type(combiner) != str: if (not buffersize) and memlim: buffersize = int(memlim * 0.33) / 512 # educated guess print >> sys.stderr, "INFO: buffersize =", buffersize inputs = sorted(outputs, buffersize) if os.environ.has_key("dumbo_joinkeys"): outputs = iterreduce(inputs, combiner, keyfunc=jk_class.fromjoinkey) elif grouper is not None: outputs = iterreduce(inputs, combiner, grouper=grouper) else: outputs = iterreduce(inputs, combiner) if os.environ.has_key("dumbo_joinkeys"): outputs = ((jk.dump(), v) for (jk, v) in outputs) if os.environ.has_key("stream_map_output") and os.environ["stream_map_output"].lower() == "typedbytes": print >> sys.stderr, "INFO: outputting typed bytes" try: import ctypedbytes as typedbytes print >> sys.stderr, "INFO: using ctypedbytes" except ImportError: import typedbytes typedbytes.PairedOutput(sys.stdout).writes(outputs) else: for output in dumpcode(outputs): print "\t".join(output) if combclose: combclose() if mapclose: mapclose() elif reducer: if type(reducer) in (types.ClassType, type): reducercls = type("DumboReducer", (reducer, mrbase_class), {}) reducer = reducercls() if 
hasattr(reducer, "configure"): redconf = reducer.configure if hasattr(reducer, "close"): redclose = reducer.close if hasattr(reducer, "reduce"): reducer = reducer.reduce if ( os.environ.has_key("stream_reduce_input") and os.environ["stream_reduce_input"].lower() == "typedbytes" ): print >> sys.stderr, "INFO: inputting typed bytes" try: import ctypedbytes as typedbytes print >> sys.stderr, "INFO: using ctypedbytes" except ImportError: import typedbytes inputs = typedbytes.PairedInput(sys.stdin).reads() else: inputs = loadcode(line[:-1] for line in sys.stdin) if redconf: redconf() if os.environ.has_key("dumbo_joinkeys"): outputs = iterreduce(inputs, reducer, keyfunc=jk_class.fromdump) outputs = ((jk.body, v) for (jk, v) in outputs) elif grouper is not None: outputs = iterreduce(inputs, reducer, grouper=grouper) else: outputs = iterreduce(inputs, reducer) if ( os.environ.has_key("stream_reduce_output") and os.environ["stream_reduce_output"].lower() == "typedbytes" ): print >> sys.stderr, "INFO: outputting typed bytes" try: import ctypedbytes as typedbytes print >> sys.stderr, "INFO: using ctypedbytes" except ImportError: import typedbytes typedbytes.PairedOutput(sys.stdout).writes(outputs) else: for output in dumpcode(outputs): print "\t".join(output) if redclose: redclose() else: for output in dumpcode(inputs): print "\t".join(output) else: # This case builds the hadoop streaming command to run if not opts: opts = [] if type(mapper) == str: opts.append(("mapper", mapper)) elif hasattr(mapper, "opts"): opts += mapper.opts if type(reducer) == str: opts.append(("reducer", reducer)) elif hasattr(reducer, "opts"): opts += reducer.opts if type(combiner) == str: opts.append(("combiner", combiner)) if type(partitioner) == str: opts.append(("partitioner", partitioner)) opts += parseargs(sys.argv[1:]) if input is not None: getopt(opts, "input", delete=True) # delete -input opts for infile in input: opts.append(("input", infile)) if output is None: outputopt = getopt(opts, 
"output", delete=False) if not outputopt: print >> sys.stderr, "ERROR: No output path specified" sys.exit(1) output = outputopt[0] newopts = {} newopts["output"] = output if not reducer: newopts["numreducetasks"] = "0" (key, delindexes) = (None, []) for (index, (key, value)) in enumerate(opts): if newopts.has_key(key): delindexes.append(index) for delindex in reversed(delindexes): del opts[delindex] opts += newopts.iteritems() backend = get_backend(opts) overwriteopt = getopt(opts, "overwrite") checkoutopt = getopt(opts, "checkoutput") checkoutput = not (checkoutopt and checkoutopt[0] == "no") fs = backend.create_filesystem(opts) if overwriteopt and overwriteopt[0] == "yes": fs.rm(output, opts) elif checkoutput and fs.exists(output, opts) == 0: print >> sys.stderr, "ERROR: Output path exists already: %s" % output sys.exit(1) opts.append(("cmdenv", "dumbo_mrbase_class=" + getclassname(backend.get_mapredbase_class(opts)))) opts.append(("cmdenv", "dumbo_jk_class=" + getclassname(backend.get_joinkey_class(opts)))) opts.append(("cmdenv", "dumbo_runinfo_class=" + getclassname(backend.get_runinfo_class(opts)))) # The function of the premapper is like a starter before a # particular map reduce job. It'll allow you to get the # file system into the right state. if premapper is not None: # TODO Think if these are the right options premapper(backend, fs, opts) retval = backend.create_iteration(opts).run() if retval == 127: print >> sys.stderr, 'ERROR: Are you sure that "python" is on your path?' if retval != 0: sys.exit(retval) # The function of the post-mapper is to do post-processing # on the output of the MapReduce iteration if postreducer is not None: # TODO Think if these are the right options postreducer(backend, fs, opts)
def run(self):
    """Run the job chain, either as one streaming task or as the driver.

    Task mode (first command-line argument does not start with '-'):
    sys.argv[2], defaulting to 0, selects one entry of self.iters and the
    module-level run() is called with that entry's arguments.

    Driver mode (otherwise): every entry of self.iters is launched in
    order through the module-level run().  For each iteration the options
    are rebuilt from the entry's own 'opts' plus self._argopts, the
    'input'/'output' keyword arguments are rewritten so that intermediate
    results go to "<output>_pre<N>" paths, and intermediate outputs are
    removed once the iteration recorded in self.deps has consumed them
    (unless 'preoutputs' is 'yes').

    Exits with status 1 when no input or output path is available, or
    when an iteration mixes the job input (-1) with intermediate results.
    """
    if len(sys.argv) > 1 and not sys.argv[1][0] == '-':
        iterarg = 0  # default value
        if len(sys.argv) > 2:
            iterarg = int(sys.argv[2])
        # for loop isn't necessary but helps for error reporting apparently
        for args, kwargs in self.iters[iterarg:iterarg+1]:
            kwargs['iter'] = iterarg
            run(*args, **kwargs)
    else:
        # Paths read from the -inputfile listings; loaded on the first
        # iteration only and then reused for the later ones.
        listed_inputs = None

        for _iter, (args, kwargs) in enumerate(self.iters):
            kwargs['iter'] = _iter
            opts = Options(kwargs.get('opts', []))
            opts += self._argopts

            # this has to be done early, while all the opts are still there
            backend = get_backend(opts)
            fs = backend.create_filesystem(opts)

            preoutputsopt = opts.pop('preoutputs')
            delinputsopt = opts.pop('delinputs')

            # Handle inputfile options here; we are past the point where a
            # bunch of things would get dumped to the commandline.  The
            # original guard compared the builtin `iter` (not `_iter`)
            # with 0, which is never true, so the listings were never
            # expanded.  Popping also removes the inputfile options from
            # opts.
            # NOTE(review): assumes Options.pop returns an iterable
            # container of values, as its use with `in` below suggests.
            inputfiles = opts.pop('inputfile')
            if listed_inputs is None:
                listed_inputs = []
                for inputfile in inputfiles:
                    listing = open(inputfile)
                    try:
                        for line in listing:
                            infile = line.strip()
                            if infile:  # ignore blank lines in the listing
                                listed_inputs.append(infile)
                    finally:
                        listing.close()
            # opts is rebuilt for every iteration, so the listed paths are
            # added each time; otherwise the input check below would fail
            # on later iterations when only -inputfile was given.
            for infile in listed_inputs:
                opts.add('input', infile)

            job_inputs = opts['input']
            if not job_inputs:
                print >> sys.stderr, 'ERROR: No input path specified'
                sys.exit(1)

            outputopt = opts['output']
            if not outputopt:
                print >> sys.stderr, 'ERROR: No output path specified'
                sys.exit(1)

            job_output = outputopt[0]

            newopts = Options()
            newopts.add('iteration', str(_iter))
            newopts.add('itercount', str(len(self.iters)))

            _input = kwargs['input']
            if type(_input) == int:
                _input = [_input]
            if _input == [-1]:
                # -1 stands for the original job input.
                kwargs['input'] = job_inputs
                delinputs = 'yes' if 'yes' in delinputsopt and _iter == self.deps[-1] else 'no'
                newopts.add('delinputs', delinputs)
            else:
                if -1 in _input:
                    print >> sys.stderr, 'ERROR: Cannot mix job input with intermediate results'
                    sys.exit(1)
                # Read the intermediate outputs of the referenced iterations.
                kwargs['input'] = [job_output + "_pre" + str(initer + 1) for initer in _input]
                newopts.add('inputformat', 'code')
                if 'yes' in opts['addpath']:  # not when == 'iter'
                    newopts.add('addpath', 'no')
                newopts.add('delinputs', 'no')

            if _iter == len(self.iters) - 1:
                # Only the last iteration writes to the real job output.
                kwargs['output'] = job_output
            else:
                kwargs['output'] = job_output + "_pre" + str(_iter + 1)
                newopts.add('outputformat', 'code')
                if 'yes' in opts['getpath']:  # not when == 'iter'
                    newopts.add('getpath', 'no')

            # Values computed above replace any user-supplied ones.
            keys = [k for k, _ in opts if k in newopts]
            opts.remove(*keys)
            opts += newopts

            kwargs['opts'] = opts

            if "initializer" not in kwargs and self.initializer is not None:
                kwargs["initializer"] = self.initializer

            run(*args, **kwargs)

            # Remove intermediate outputs once the iteration recorded in
            # self.deps for them has run.
            if 'yes' not in preoutputsopt and _input != [-1]:
                for initer in _input:
                    if _iter == self.deps[initer]:
                        fs.rm(job_output + "_pre" + str(initer + 1), opts)
def run(mapper, reducer=None, combiner=None, buffersize=None, mapconf=None, redconf=None, combconf=None, mapclose=None, redclose=None, combclose=None, opts=None, iter=0, itercnt=1): if len(sys.argv) > 1 and not sys.argv[1][0] == '-': iterarg = 0 # default value if len(sys.argv) > 2: iterarg = int(sys.argv[2]) memlim = None # memory limit if len(sys.argv) > 3: memlim = int(sys.argv[3]) resource.setrlimit(resource.RLIMIT_AS, (memlim, memlim)) mrbase_class = loadclassname(os.environ['dumbo_mrbase_class']) jk_class = loadclassname(os.environ['dumbo_jk_class']) if iterarg == iter: if sys.argv[1].startswith('map'): if type(mapper) in (types.ClassType, type): mappercls = type('DumboMapper', (mapper, mrbase_class), {}) mapper = mappercls() if hasattr(mapper, 'configure'): mapconf = mapper.configure if hasattr(mapper, 'close'): mapclose = mapper.close if hasattr(mapper, 'map'): mapper = mapper.map if type(combiner) in (types.ClassType, type): combinercls = type('DumboCombiner', (combiner, mrbase_class), {}) combiner = combinercls() if hasattr(combiner, 'configure'): combconf = combiner.configure if hasattr(combiner, 'close'): combclose = combiner.close if hasattr(combiner, 'reduce'): combiner = combiner.reduce try: print >> sys.stderr, "INFO: consuming %s" % \ os.environ['map_input_file'] except KeyError: pass if os.environ.has_key('stream_map_input') and \ os.environ['stream_map_input'].lower() == 'typedbytes': print >> sys.stderr, "INFO: inputting typed bytes" try: import ctypedbytes as typedbytes except ImportError: import typedbytes inputs = typedbytes.PairedInput(sys.stdin).reads() else: inputs = loadcode(line[:-1] for line in sys.stdin) if mapconf: mapconf() if combconf: combconf() if os.environ.has_key('dumbo_addpath'): path = os.environ['map_input_file'] inputs = (((path, k), v) for (k, v) in inputs) if os.environ.has_key('dumbo_joinkeys'): inputs = ((jk_class(k), v) for (k, v) in inputs) if os.environ.has_key('dumbo_parser'): parser = os.environ['dumbo_parser'] 
clsname = parser.split('.')[-1] modname = '.'.join(parser.split('.')[:-1]) if not modname: raise ImportError(parser) module = __import__(modname, fromlist=[clsname]) parse = getattr(module, clsname)().parse outputs = itermap(inputs, mapper, parse) elif os.environ.has_key('dumbo_record'): record = os.environ['dumbo_record'] clsname = record.split('.')[-1] modname = '.'.join(record.split('.')[:-1]) if not modname: raise ImportError(parser) module = __import__(modname, fromlist=[clsname]) set = getattr(module, clsname)().set outputs = itermap(inputs, mapper, lambda v: set(*v)) else: outputs = itermap(inputs, mapper) if combiner and type(combiner) != str: if (not buffersize) and memlim: buffersize = int(memlim * 0.33) / 512 # educated guess print >> sys.stderr, 'INFO: buffersize =', buffersize inputs = sorted(outputs, buffersize) if os.environ.has_key('dumbo_joinkeys'): outputs = iterreduce(inputs, combiner, keyfunc=jk_class.fromjoinkey) else: outputs = iterreduce(inputs, combiner) if os.environ.has_key('dumbo_joinkeys'): outputs = ((jk.dump(), v) for (jk, v) in outputs) if os.environ.has_key('stream_map_output') and \ os.environ['stream_map_output'].lower() == 'typedbytes': print >> sys.stderr, "INFO: outputting typed bytes" try: import ctypedbytes as typedbytes except ImportError: import typedbytes typedbytes.PairedOutput(sys.stdout).writes(outputs) else: for output in dumpcode(outputs): print '\t'.join(output) if combclose: combclose() if mapclose: mapclose() elif reducer: if type(reducer) in (types.ClassType, type): reducercls = type('DumboReducer', (reducer, mrbase_class), {}) reducer = reducercls() if hasattr(reducer, 'configure'): redconf = reducer.configure if hasattr(reducer, 'close'): redclose = reducer.close if hasattr(reducer, 'reduce'): reducer = reducer.reduce if os.environ.has_key('stream_reduce_input') and \ os.environ['stream_reduce_input'].lower() == 'typedbytes': print >> sys.stderr, "INFO: inputting typed bytes" try: import ctypedbytes as typedbytes 
except ImportError: import typedbytes inputs = typedbytes.PairedInput(sys.stdin).reads() else: inputs = loadcode(line[:-1] for line in sys.stdin) if redconf: redconf() if os.environ.has_key('dumbo_joinkeys'): outputs = iterreduce(inputs, reducer, keyfunc=jk_class.fromdump) outputs = ((jk.body, v) for (jk, v) in outputs) else: outputs = iterreduce(inputs, reducer) if os.environ.has_key('stream_reduce_output') and \ os.environ['stream_reduce_output'].lower() == 'typedbytes': print >> sys.stderr, "INFO: outputting typed bytes" try: import ctypedbytes as typedbytes except ImportError: import typedbytes typedbytes.PairedOutput(sys.stdout).writes(outputs) else: for output in dumpcode(outputs): print '\t'.join(output) if redclose: redclose() else: for output in dumpcode(inputs): print '\t'.join(output) else: if not opts: opts = [] if type(mapper) == str: opts.append(('mapper', mapper)) elif hasattr(mapper, 'opts'): opts += mapper.opts if type(reducer) == str: opts.append(('reducer', reducer)) elif hasattr(reducer, 'opts'): opts += reducer.opts if type(combiner) == str: opts.append(('combiner', combiner)) opts += parseargs(sys.argv[1:]) outputopt = getopt(opts, 'output', delete=False) if not outputopt: print >> sys.stderr, 'ERROR: No output path specified' sys.exit(1) output = outputopt[0] newopts = {} newopts['iteration'] = str(iter) newopts['itercount'] = str(itercnt) preoutputsopt = getopt(opts, 'preoutputs') addpathopt = getopt(opts, 'addpath', delete=False) getpathopt = getopt(opts, 'getpath', delete=False) if iter != 0: newopts['input'] = output + "_pre" + str(iter) if not (preoutputsopt and preoutputsopt[0] == 'yes'): newopts['delinputs'] = 'yes' newopts['inputformat'] = 'code' if addpathopt and addpathopt[0] == 'yes': # not when == 'iter' newopts['addpath'] = 'no' if iter < itercnt - 1: output += "_pre" + str(iter + 1) newopts['output'] = output newopts['outputformat'] = 'code' if getpathopt and getpathopt[0] == 'yes': # not when == 'iter' newopts['getpath'] = 'no' 
if not reducer: newopts['numreducetasks'] = '0' (key, delindexes) = (None, []) for (index, (key, value)) in enumerate(opts): if newopts.has_key(key): delindexes.append(index) for delindex in reversed(delindexes): del opts[delindex] opts += newopts.iteritems() backend = get_backend(opts) overwriteopt = getopt(opts, 'overwrite') checkoutopt = getopt(opts, 'checkoutput') checkoutput = not (checkoutopt and checkoutopt[0] == 'no') fs = backend.create_filesystem(opts) if overwriteopt and overwriteopt[0] == 'yes': fs.rm(output, opts) elif checkoutput and fs.exists(output, opts) == 0: print >> sys.stderr, 'ERROR: Output path exists already: %s' % output sys.exit(1) opts.append(('cmdenv', 'dumbo_mrbase_class=' + \ getclassname(backend.get_mapredbase_class(opts)))) opts.append(('cmdenv', 'dumbo_jk_class=' + \ getclassname(backend.get_joinkey_class(opts)))) retval = backend.create_iteration(opts).run() if retval == 127: print >> sys.stderr, 'ERROR: Are you sure that "python" is on your path?' if retval != 0: sys.exit(retval)