Exemple #1
0
def starter(program):
  """Prepare the job: zip the two corpora together and configure `program`.

  Takes the ``original_parse``, ``original_ref`` and ``rule_file`` options
  off `program` (all three must be set), runs ``hadoop/zip.py`` through a
  ShellRunner to write ``$TMPDIR/combined.txt``, uploads that file via the
  backend filesystem as ``combined.txt``, then registers it as the job input
  and forwards a handful of environment variables to the tasks.
  """
  from dumbo.backends import get_backend

  def pop_required(name):
    # delopt hands back the option's value; a falsy value means the
    # option was not supplied
    value = program.delopt(name)
    assert value, name
    return value

  parse_path, ref_path, rules_path = [
      pop_required(key)
      for key in ("original_parse", "original_ref", "rule_file")]

  shell = ShellRunner({})
  substitutions = (
      ("CHINESE", parse_path),
      ("ENGLISH", ref_path),
      ("P_DIR", os.getenv("TRANSFOREST")),
      ("PYTHON", os.getenv("PYTHON")),
      ("TMPDIR", os.getenv("TMPDIR")),
  )
  for placeholder, replacement in substitutions:
    shell.subs[placeholder] = replacement

  shell.call("$PYTHON $P_DIR/hadoop/zip.py $CHINESE $ENGLISH > $TMPDIR/combined.txt")

  filesystem = get_backend(program.opts).create_filesystem(program.opts)
  local_combined = shell.complete("$TMPDIR/combined.txt")
  filesystem.put(local_combined, "combined.txt", program.opts)

  program.addopt("input", "combined.txt")
  program.addopt("cmdenv", "RULE_FILE=%s" % rules_path)
  # pass these through unchanged from the launching environment
  for env_name in ("PYTHON", "TRANSFOREST", "LD_LIBRARY_PATH", "PYTHONPATH"):
    program.addopt("cmdenv", "%s=%s" % (env_name, os.getenv(env_name)))
Exemple #2
0
    def run(self):
        """Execute the iterations registered in ``self.iters``.

        When the first command-line argument exists and does not begin with
        '-', only the iteration whose index is given by ``sys.argv[2]``
        (0 when absent) is handed to the module-level ``run``.  Otherwise
        every iteration is launched in order; its input/output paths and
        per-iteration options are derived from the job-level ``input`` and
        ``output`` options, and intermediate outputs are removed once the
        last iteration depending on them has finished (unless the
        ``preoutputs`` option contains 'yes').
        """
        in_task = len(sys.argv) > 1 and sys.argv[1][0] != '-'
        if in_task:
            which = int(sys.argv[2]) if len(sys.argv) > 2 else 0
            # a one-element slice is looped over on purpose: the original
            # author notes that it helps error reporting
            for args, kwargs in self.iters[which:which + 1]:
                kwargs['iter'] = which
                run(*args, **kwargs)
            return

        for step, (args, kwargs) in enumerate(self.iters):
            kwargs['iter'] = step
            opts = Options(kwargs.get('opts', []))
            opts += parseargs(sys.argv[1:])

            # backend and filesystem have to be resolved early, while all
            # the options are still present
            backend = get_backend(opts)
            fs = backend.create_filesystem(opts)

            keep_pre = opts.pop('preoutputs')
            del_inputs_opt = opts.pop('delinputs')

            job_inputs = opts['input']
            if not job_inputs:
                print >> sys.stderr, 'ERROR: No input path specified'
                sys.exit(1)

            output_values = opts['output']
            if not output_values:
                print >> sys.stderr, 'ERROR: No output path specified'
                sys.exit(1)
            job_output = output_values[0]

            def pre_path(index):
                # location of the intermediate output of iteration `index`
                return job_output + "_pre" + str(index + 1)

            extra = Options()
            extra.add('iteration', str(step))
            extra.add('itercount', str(len(self.iters)))

            sources = kwargs['input']
            if type(sources) is int:
                sources = [sources]
            reads_job_input = sources == [-1]

            if reads_job_input:
                kwargs['input'] = job_inputs
                if 'yes' in del_inputs_opt and step == self.deps[-1]:
                    extra.add('delinputs', 'yes')
                else:
                    extra.add('delinputs', 'no')
            else:
                if -1 in sources:
                    print >> sys.stderr, 'ERROR: Cannot mix job input with intermediate results'
                    sys.exit(1)
                kwargs['input'] = [pre_path(src) for src in sources]
                extra.add('inputformat', 'code')
                if 'yes' in opts['addpath']:  # not when == 'iter'
                    extra.add('addpath', 'no')
                extra.add('delinputs', 'no')

            is_last = step == len(self.iters) - 1
            if is_last:
                kwargs['output'] = job_output
            else:
                kwargs['output'] = pre_path(step)
                extra.add('outputformat', 'code')
                if 'yes' in opts['getpath']:  # not when == 'iter'
                    extra.add('getpath', 'no')

            # per-iteration values replace any same-named job-level option
            overridden = []
            for name, _ in opts:
                if name in extra:
                    overridden.append(name)
            opts.remove(*overridden)
            opts += extra

            kwargs['opts'] = opts

            run(*args, **kwargs)

            if reads_job_input or 'yes' in keep_pre:
                continue
            for src in sources:
                # drop an intermediate output once its last consumer is done
                if step == self.deps[src]:
                    fs.rm(pre_path(src), opts)
Exemple #3
0
def run(mapper,
        reducer=None,
        combiner=None,
        buffersize=None,
        mapconf=None,
        redconf=None,
        combconf=None,
        mapclose=None,
        redclose=None,
        combclose=None,
        mapcleanup=None,
        redcleanup=None,
        combcleanup=None,
        opts=None,
        input=None,
        output=None,
        iter=0):
    if len(sys.argv) > 1 and not sys.argv[1][0] == '-':
        iterarg = 0  # default value
        if len(sys.argv) > 2:
            iterarg = int(sys.argv[2])
        memlim = None  # memory limit
        if len(sys.argv) > 3:
            memlim = int(sys.argv[3])
            resource.setrlimit(resource.RLIMIT_AS, (memlim, memlim))

        mrbase_class = loadclassname(os.environ['dumbo_mrbase_class'])
        jk_class = loadclassname(os.environ['dumbo_jk_class'])
        runinfo = loadclassname(os.environ['dumbo_runinfo_class'])()

        if iterarg == iter:
            if sys.argv[1].startswith('map'):
                if type(mapper) in (types.ClassType, type):
                    mappercls = type('DumboMapper', (mapper, mrbase_class), {})
                    mapper = mappercls()
                if hasattr(mapper, 'configure'):
                    mapconf = mapper.configure
                if hasattr(mapper, 'close'):
                    mapclose = mapper.close
                if hasattr(mapper, 'map'):
                    mapper = mapper.map
                if hasattr(mapper, 'cleanup'):
                    mapcleanup = mapper.cleanup
                if type(combiner) in (types.ClassType, type):
                    combinercls = type('DumboCombiner', (combiner, mrbase_class), {})
                    combiner = combinercls()
                if hasattr(combiner, 'configure'):
                    combconf = combiner.configure
                if hasattr(combiner, 'close'):
                    combclose = combiner.close
                if hasattr(combiner, 'reduce'):
                    combiner = combiner.reduce
                if hasattr(combiner, 'cleanup'):
                    combcleanup = combiner.cleanup
                try:
                    print >> sys.stderr, "INFO: consuming %s" % \
                                         os.environ['map_input_file']
                except KeyError:
                    pass
                if os.environ.has_key('stream_map_input') and \
                os.environ['stream_map_input'].lower() == 'typedbytes':
                    print >> sys.stderr, "INFO: inputting typed bytes"
                    try: import ctypedbytes as typedbytes
                    except ImportError: import typedbytes
                    inputs = typedbytes.PairedInput(sys.stdin).reads()
                else:
                    inputs = loadcode(line[:-1] for line in sys.stdin)
                if mapconf:
                    mapconf()
                if combconf:
                    combconf()
                if os.environ.has_key('dumbo_addpath'):
                    path = runinfo.get_input_path()
                    inputs = (((path, k), v) for (k, v) in inputs)
                if os.environ.has_key('dumbo_joinkeys'):
                    inputs = ((jk_class(k), v) for (k, v) in inputs)

                if os.environ.has_key('dumbo_parser'):
                    parser = os.environ['dumbo_parser']
                    clsname = parser.split('.')[-1]
                    modname = '.'.join(parser.split('.')[:-1])
                    if not modname:
                        raise ImportError(parser)
                    module = __import__(modname, fromlist=[clsname])
                    parse = getattr(module, clsname)().parse
                    outputs = itermap(inputs, mapper, parse)
                elif os.environ.has_key('dumbo_record'):
                    record = os.environ['dumbo_record']
                    clsname = record.split('.')[-1]
                    modname = '.'.join(record.split('.')[:-1])
                    if not modname:
                        raise ImportError(parser)
                    module = __import__(modname, fromlist=[clsname])
                    set = getattr(module, clsname)().set
                    outputs = itermap(inputs, mapper, lambda v: set(*v))
                else:
                    outputs = itermap(inputs, mapper)
                if mapcleanup:
                    outputs = chain(outputs, mapcleanup())

                # Combiner
                if combiner and type(combiner) != str:
                    if (not buffersize) and memlim:
                        buffersize = int(memlim * 0.33) / 512  # educated guess
                        print >> sys.stderr, 'INFO: buffersize =', buffersize
                    inputs = sorted(outputs, buffersize)
                    if os.environ.has_key('dumbo_joinkeys'):
                        outputs = iterreduce(inputs, combiner,
                                             keyfunc=jk_class.fromjoinkey)
                    else:
                        outputs = iterreduce(inputs, combiner)
                if os.environ.has_key('dumbo_joinkeys'):
                    outputs = ((jk.dump(), v) for (jk, v) in outputs)
                if combcleanup:
                    outputs = chain(outputs, combcleanup())

                if os.environ.has_key('stream_map_output') and \
                os.environ['stream_map_output'].lower() == 'typedbytes':
                    print >> sys.stderr, "INFO: outputting typed bytes"
                    try: import ctypedbytes as typedbytes
                    except ImportError: import typedbytes
                    typedbytes.PairedOutput(sys.stdout).writes(outputs)
                else:
                    for output in dumpcode(outputs):
                        print '\t'.join(output)
                if combclose:
                    combclose()
                if mapclose:
                    mapclose()

            elif reducer:
                # Reducer
                if type(reducer) in (types.ClassType, type):
                    reducercls = type('DumboReducer', (reducer, mrbase_class), {})
                    reducer = reducercls()
                if hasattr(reducer, 'configure'):
                    redconf = reducer.configure
                if hasattr(reducer, 'close'):
                    redclose = reducer.close
                if hasattr(reducer, 'reduce'):
                    reducer = reducer.reduce
                if hasattr(reducer, 'cleanup'):
                    redcleanup = reducer.cleanup
                if os.environ.has_key('stream_reduce_input') and \
                os.environ['stream_reduce_input'].lower() == 'typedbytes':
                    print >> sys.stderr, "INFO: inputting typed bytes"
                    try: import ctypedbytes as typedbytes
                    except ImportError: import typedbytes
                    inputs = typedbytes.PairedInput(sys.stdin).reads()
                else:
                    inputs = loadcode(line[:-1] for line in sys.stdin)
                if redconf:
                    redconf()
                if os.environ.has_key('dumbo_joinkeys'):
                    outputs = iterreduce(inputs, reducer,
                                         keyfunc=jk_class.fromdump)
                    outputs = ((jk.body, v) for (jk, v) in outputs)
                else:
                    outputs = iterreduce(inputs, reducer)
                if redcleanup:
                    outputs = chain(outputs, redcleanup())
                if os.environ.has_key('stream_reduce_output') and \
                os.environ['stream_reduce_output'].lower() == 'typedbytes':
                    print >> sys.stderr, "INFO: outputting typed bytes"
                    try: import ctypedbytes as typedbytes
                    except ImportError: import typedbytes
                    typedbytes.PairedOutput(sys.stdout).writes(outputs)
                else:
                    for output in dumpcode(outputs):
                        print '\t'.join(output)
                if redclose:
                    redclose()
            else:
                for output in dumpcode(inputs):
                    print '\t'.join(output)
    else:
        opts = Options(opts)
        if type(mapper) == str:
            opts.add('mapper', mapper)
        elif hasattr(mapper, 'opts'):
            opts += mapper.opts
        if type(reducer) == str:
            opts.add('reducer', reducer)
        elif hasattr(reducer, 'opts'):
            opts += reducer.opts
        if type(combiner) == str:
            opts.add('combiner', combiner)
        opts += parseargs(sys.argv[1:])

        if input is not None:
            opts.remove('input')
            for infile in input:
                opts.add('input', infile)

        if output is None:
            outputopt = opts['output']
            if not outputopt:
                print >> sys.stderr, 'ERROR: No output path specified'
                sys.exit(1)
            output = outputopt[0]

        newopts = Options()
        newopts.add('output', output)
        if not reducer:
            newopts.add('numreducetasks', '0')

        keys = [k for k, _ in opts if k in newopts]
        opts.remove(*keys)
        opts += newopts

        backend = get_backend(opts)

        overwriteopt = opts.pop('overwrite')
        checkoutput = 'no' not in opts.pop('checkoutput')
        fs = backend.create_filesystem(opts)
        if 'yes' in overwriteopt:
            fs.rm(output, opts)
        elif checkoutput and fs.exists(output, opts) == 0:
            print >> sys.stderr, 'ERROR: Output path exists already: %s' % output
            sys.exit(1)
        
        opts.add('cmdenv', 'dumbo_mrbase_class=' + \
                     getclassname(backend.get_mapredbase_class(opts)))
        opts.add('cmdenv', 'dumbo_jk_class=' + \
                     getclassname(backend.get_joinkey_class(opts)))
        opts.add('cmdenv', 'dumbo_runinfo_class=' + \
                     getclassname(backend.get_runinfo_class(opts)))
        retval = backend.create_iteration(opts).run()
        if retval == 127:
            print >> sys.stderr, 'ERROR: Are you sure that "python" is on your path?'
        if retval != 0:
            sys.exit(retval)
Exemple #4
0
    def run(self):
        """Run the chain of iterations stored in ``self.iters``.

        If the first command-line argument is present and does not start
        with '-', just the iteration selected by ``sys.argv[2]`` (0 when
        missing) is passed to the module-level ``run``.  Otherwise all
        iterations are launched one after another: each gets its input and
        output paths plus a set of per-iteration options, and intermediate
        outputs are deleted after their last dependent iteration unless the
        ``preoutputs`` option contains 'yes'.
        """
        if len(sys.argv) > 1 and sys.argv[1][0] != '-':
            selected = int(sys.argv[2]) if len(sys.argv) > 2 else 0
            # looping over a one-element slice is deliberate: per the
            # original author it helps error reporting
            for args, kwargs in self.iters[selected:selected + 1]:
                kwargs['iter'] = selected
                run(*args, **kwargs)
            return

        for index, (args, kwargs) in enumerate(self.iters):
            kwargs['iter'] = index
            opts = Options(kwargs.get('opts', []))
            opts += parseargs(sys.argv[1:])

            # this has to happen early, while all the opts are still there
            backend = get_backend(opts)
            fs = backend.create_filesystem(opts)

            preoutputs_values = opts.pop('preoutputs')
            delinputs_values = opts.pop('delinputs')

            job_inputs = opts['input']
            if not job_inputs:
                print >> sys.stderr, 'ERROR: No input path specified'
                sys.exit(1)

            output_values = opts['output']
            if not output_values:
                print >> sys.stderr, 'ERROR: No output path specified'
                sys.exit(1)
            job_output = output_values[0]

            def intermediate(n):
                # path holding the intermediate output of iteration `n`
                return job_output + "_pre" + str(n + 1)

            overrides = Options()
            overrides.add('iteration', str(index))
            overrides.add('itercount', str(len(self.iters)))

            feeds = kwargs['input']
            if type(feeds) is int:
                feeds = [feeds]
            from_job_input = feeds == [-1]

            if not from_job_input:
                if -1 in feeds:
                    print >> sys.stderr, 'ERROR: Cannot mix job input with intermediate results'
                    sys.exit(1)
                kwargs['input'] = [intermediate(n) for n in feeds]
                overrides.add('inputformat', 'code')
                if 'yes' in opts['addpath']:  # not when == 'iter'
                    overrides.add('addpath', 'no')
                overrides.add('delinputs', 'no')
            else:
                kwargs['input'] = job_inputs
                if 'yes' in delinputs_values and index == self.deps[-1]:
                    overrides.add('delinputs', 'yes')
                else:
                    overrides.add('delinputs', 'no')

            if index != len(self.iters) - 1:
                kwargs['output'] = intermediate(index)
                overrides.add('outputformat', 'code')
                if 'yes' in opts['getpath']:  # not when == 'iter'
                    overrides.add('getpath', 'no')
            else:
                kwargs['output'] = job_output

            # the overrides win over same-named options collected so far
            stale = []
            for optname, _ in opts:
                if optname in overrides:
                    stale.append(optname)
            opts.remove(*stale)
            opts += overrides

            kwargs['opts'] = opts

            run(*args, **kwargs)

            if 'yes' in preoutputs_values or from_job_input:
                continue
            for n in feeds:
                # remove an intermediate output after its last consumer
                if index == self.deps[n]:
                    fs.rm(intermediate(n), opts)
Exemple #5
0
def run(mapper,
        reducer=None,
        combiner=None,
        buffersize=None,
        mapconf=None,
        redconf=None,
        combconf=None,
        mapclose=None,
        redclose=None,
        combclose=None,
        opts=None,
        input=None,
        output=None,
        iter=0):
    if len(sys.argv) > 1 and not sys.argv[1][0] == '-':
        iterarg = 0  # default value
        if len(sys.argv) > 2:
            iterarg = int(sys.argv[2])
        memlim = None  # memory limit
        if len(sys.argv) > 3:
            memlim = int(sys.argv[3])
            resource.setrlimit(resource.RLIMIT_AS, (memlim, memlim))

        mrbase_class = loadclassname(os.environ['dumbo_mrbase_class'])
        jk_class = loadclassname(os.environ['dumbo_jk_class'])
        runinfo = loadclassname(os.environ['dumbo_runinfo_class'])()

        if iterarg == iter:
            if sys.argv[1].startswith('map'):
                if type(mapper) in (types.ClassType, type):
                    mappercls = type('DumboMapper', (mapper, mrbase_class), {})
                    mapper = mappercls()
                if hasattr(mapper, 'configure'):
                    mapconf = mapper.configure
                if hasattr(mapper, 'close'):
                    mapclose = mapper.close
                if hasattr(mapper, 'map'):
                    mapper = mapper.map
                if type(combiner) in (types.ClassType, type):
                    combinercls = type('DumboCombiner',
                                       (combiner, mrbase_class), {})
                    combiner = combinercls()
                if hasattr(combiner, 'configure'):
                    combconf = combiner.configure
                if hasattr(combiner, 'close'):
                    combclose = combiner.close
                if hasattr(combiner, 'reduce'):
                    combiner = combiner.reduce
                try:
                    print >> sys.stderr, "INFO: consuming %s" % \
                                         os.environ['map_input_file']
                except KeyError:
                    pass
                if os.environ.has_key('stream_map_input') and \
                os.environ['stream_map_input'].lower() == 'typedbytes':
                    print >> sys.stderr, "INFO: inputting typed bytes"
                    try:
                        import ctypedbytes as typedbytes
                    except ImportError:
                        import typedbytes
                    inputs = typedbytes.PairedInput(sys.stdin).reads()
                else:
                    inputs = loadcode(line[:-1] for line in sys.stdin)
                if mapconf:
                    mapconf()
                if combconf:
                    combconf()
                if os.environ.has_key('dumbo_addpath'):
                    path = runinfo.get_input_path()
                    inputs = (((path, k), v) for (k, v) in inputs)
                if os.environ.has_key('dumbo_joinkeys'):
                    inputs = ((jk_class(k), v) for (k, v) in inputs)
                if os.environ.has_key('dumbo_parser'):
                    parser = os.environ['dumbo_parser']
                    clsname = parser.split('.')[-1]
                    modname = '.'.join(parser.split('.')[:-1])
                    if not modname:
                        raise ImportError(parser)
                    module = __import__(modname, fromlist=[clsname])
                    parse = getattr(module, clsname)().parse
                    outputs = itermap(inputs, mapper, parse)
                elif os.environ.has_key('dumbo_record'):
                    record = os.environ['dumbo_record']
                    clsname = record.split('.')[-1]
                    modname = '.'.join(record.split('.')[:-1])
                    if not modname:
                        raise ImportError(parser)
                    module = __import__(modname, fromlist=[clsname])
                    set = getattr(module, clsname)().set
                    outputs = itermap(inputs, mapper, lambda v: set(*v))
                else:
                    outputs = itermap(inputs, mapper)
                if combiner and type(combiner) != str:
                    if (not buffersize) and memlim:
                        buffersize = int(memlim * 0.33) / 512  # educated guess
                        print >> sys.stderr, 'INFO: buffersize =', buffersize
                    inputs = sorted(outputs, buffersize)
                    if os.environ.has_key('dumbo_joinkeys'):
                        outputs = iterreduce(inputs,
                                             combiner,
                                             keyfunc=jk_class.fromjoinkey)
                    else:
                        outputs = iterreduce(inputs, combiner)
                if os.environ.has_key('dumbo_joinkeys'):
                    outputs = ((jk.dump(), v) for (jk, v) in outputs)
                if os.environ.has_key('stream_map_output') and \
                os.environ['stream_map_output'].lower() == 'typedbytes':
                    print >> sys.stderr, "INFO: outputting typed bytes"
                    try:
                        import ctypedbytes as typedbytes
                    except ImportError:
                        import typedbytes
                    typedbytes.PairedOutput(sys.stdout).writes(outputs)
                else:
                    for output in dumpcode(outputs):
                        print '\t'.join(output)
                if combclose:
                    combclose()
                if mapclose:
                    mapclose()
            elif reducer:
                if type(reducer) in (types.ClassType, type):
                    reducercls = type('DumboReducer', (reducer, mrbase_class),
                                      {})
                    reducer = reducercls()
                if hasattr(reducer, 'configure'):
                    redconf = reducer.configure
                if hasattr(reducer, 'close'):
                    redclose = reducer.close
                if hasattr(reducer, 'reduce'):
                    reducer = reducer.reduce
                if os.environ.has_key('stream_reduce_input') and \
                os.environ['stream_reduce_input'].lower() == 'typedbytes':
                    print >> sys.stderr, "INFO: inputting typed bytes"
                    try:
                        import ctypedbytes as typedbytes
                    except ImportError:
                        import typedbytes
                    inputs = typedbytes.PairedInput(sys.stdin).reads()
                else:
                    inputs = loadcode(line[:-1] for line in sys.stdin)
                if redconf:
                    redconf()
                if os.environ.has_key('dumbo_joinkeys'):
                    outputs = iterreduce(inputs,
                                         reducer,
                                         keyfunc=jk_class.fromdump)
                    outputs = ((jk.body, v) for (jk, v) in outputs)
                else:
                    outputs = iterreduce(inputs, reducer)
                if os.environ.has_key('stream_reduce_output') and \
                os.environ['stream_reduce_output'].lower() == 'typedbytes':
                    print >> sys.stderr, "INFO: outputting typed bytes"
                    try:
                        import ctypedbytes as typedbytes
                    except ImportError:
                        import typedbytes
                    typedbytes.PairedOutput(sys.stdout).writes(outputs)
                else:
                    for output in dumpcode(outputs):
                        print '\t'.join(output)
                if redclose:
                    redclose()
            else:
                for output in dumpcode(inputs):
                    print '\t'.join(output)
    else:
        opts = Options(opts)
        if type(mapper) == str:
            opts.add('mapper', mapper)
        elif hasattr(mapper, 'opts'):
            opts += mapper.opts
        if type(reducer) == str:
            opts.add('reducer', reducer)
        elif hasattr(reducer, 'opts'):
            opts += reducer.opts
        if type(combiner) == str:
            opts.add('combiner', combiner)
        opts += parseargs(sys.argv[1:])

        if input is not None:
            opts.remove('input')
            for infile in input:
                opts.add('input', infile)

        if output is None:
            outputopt = opts['output']
            if not outputopt:
                print >> sys.stderr, 'ERROR: No output path specified'
                sys.exit(1)
            output = outputopt[0]

        newopts = Options()
        newopts.add('output', output)
        if not reducer:
            newopts.add('numreducetasks', '0')

        keys = [k for k, _ in opts if k in newopts]
        opts.remove(*keys)
        opts += newopts

        backend = get_backend(opts)

        overwriteopt = opts.pop('overwrite')
        checkoutput = 'no' not in opts.pop('checkoutput')
        fs = backend.create_filesystem(opts)
        if 'yes' in overwriteopt:
            fs.rm(output, opts)
        elif checkoutput and fs.exists(output, opts) == 0:
            print >> sys.stderr, 'ERROR: Output path exists already: %s' % output
            sys.exit(1)

        opts.add('cmdenv', 'dumbo_mrbase_class=' + \
                     getclassname(backend.get_mapredbase_class(opts)))
        opts.add('cmdenv', 'dumbo_jk_class=' + \
                     getclassname(backend.get_joinkey_class(opts)))
        opts.add('cmdenv', 'dumbo_runinfo_class=' + \
                     getclassname(backend.get_runinfo_class(opts)))
        retval = backend.create_iteration(opts).run()
        if retval == 127:
            print >> sys.stderr, 'ERROR: Are you sure that "python" is on your path?'
        if retval != 0:
            sys.exit(retval)
Exemple #6
0
    def run(self):
        """Run every iteration registered in ``self.iters``.

        When the first command-line argument exists and does not start with
        '-', each iteration is simply forwarded to the module-level ``run``
        with its index in ``kwargs['iter']``.  Otherwise each iteration is
        launched in turn: its input and output paths and a set of
        per-iteration options are derived from the job-level ``input`` and
        ``output`` options, and intermediate outputs are removed after the
        last iteration that depends on them unless ``preoutputs`` is 'yes'.

        Options are (key, value) pairs kept in a plain list.  The list
        supplied through ``kwargs['opts']`` is copied before it is modified,
        so a list shared between iterations is left untouched.
        """
        for (step, (args, kwargs)) in enumerate(self.iters):
            kwargs['iter'] = step

            if len(sys.argv) > 1 and not sys.argv[1][0] == '-':
                run(*args, **kwargs)
                continue

            # copy first: everything below appends to and deletes from opts,
            # which must not leak into the caller-supplied list
            opts = list(kwargs.get('opts', []))
            opts += parseargs(sys.argv[1:])

            # this has to be done early, while all the opts are still there
            backend = get_backend(opts)
            fs = backend.create_filesystem(opts)

            # presumably getopt removes the matching entries from opts
            # unless delete=False is given -- confirm against its definition
            preoutputsopt = getopt(opts, 'preoutputs')
            delinputsopt = getopt(opts, 'delinputs')
            addpathopt = getopt(opts, 'addpath', delete=False)
            getpathopt = getopt(opts, 'getpath', delete=False)

            job_inputs = getopt(opts, 'input', delete=False)
            if not job_inputs:
                print >> sys.stderr, 'ERROR: No input path specified'
                sys.exit(1)

            outputopt = getopt(opts, 'output', delete=False)
            if not outputopt:
                print >> sys.stderr, 'ERROR: No output path specified'
                sys.exit(1)
            job_output = outputopt[0]

            newopts = {}
            newopts['iteration'] = str(step)
            newopts['itercount'] = str(len(self.iters))

            sources = kwargs['input']
            if type(sources) == int:
                sources = [sources]
            if sources == [-1]:
                # -1 stands for the job-level input paths
                kwargs['input'] = job_inputs
                if delinputsopt and delinputsopt[0] == 'yes' and step == self.deps[-1]:
                    newopts['delinputs'] = 'yes'
                else:
                    newopts['delinputs'] = 'no'
            else:
                if -1 in sources:
                    print >> sys.stderr, 'ERROR: Cannot mix job input with intermediate results'
                    sys.exit(1)
                kwargs['input'] = [job_output + "_pre" + str(initer + 1) for initer in sources]
                newopts['inputformat'] = 'code'
                if addpathopt and addpathopt[0] == 'yes':  # not when == 'iter'
                    newopts['addpath'] = 'no'
                newopts['delinputs'] = 'no' # we'll take care of it ourselves

            if step == len(self.iters) - 1:
                kwargs['output'] = job_output
            else:
                kwargs['output'] = job_output + "_pre" + str(step + 1)
                newopts['outputformat'] = 'code'
                if getpathopt and getpathopt[0] == 'yes':  # not when == 'iter'
                    newopts['getpath'] = 'no'

            # per-iteration values replace any same-named option
            opts = [opt for opt in opts if opt[0] not in newopts]
            opts += newopts.items()
            kwargs['opts'] = opts

            run(*args, **kwargs)

            if not (preoutputsopt and preoutputsopt[0] == 'yes') and sources != [-1]:
                for initer in sources:
                    # remove an intermediate output once its last consumer
                    # has finished
                    if step == self.deps[initer]:
                        fs.rm(job_output + "_pre" + str(initer + 1), opts)
Exemple #7
0
    def run(self):
        """Execute every iteration stored in self.iters, in order.

        If the first command-line argument does not start with '-', each
        iteration is simply forwarded to the module-level run() with its
        'iter' index filled in.  Otherwise the job-level options are turned
        into per-iteration ones: inputs and outputs are chained through
        "<output>_pre<N>" paths, the helper options are rebuilt, and an
        intermediate path is removed after the iteration whose index equals
        self.deps[<producer index>], unless the 'preoutputs' option is 'yes'.

        Exits with status 1 when no input or output path is given, or when
        an iteration mixes the job input (-1) with intermediate results.
        """
        for (step, (args, kwargs)) in enumerate(self.iters):
            kwargs['iter'] = step

            if len(sys.argv) > 1 and not sys.argv[1][0] == '-':
                run(*args, **kwargs)
                continue

            opts = kwargs.get('opts', [])
            opts += parseargs(sys.argv[1:])

            # These two lookups do not pass delete=False, unlike the ones
            # below -- presumably they also strip the option from opts.
            preoutputsopt = getopt(opts, 'preoutputs')
            delinputsopt = getopt(opts, 'delinputs')
            addpathopt = getopt(opts, 'addpath', delete=False)
            getpathopt = getopt(opts, 'getpath', delete=False)

            job_inputs = getopt(opts, 'input', delete=False)
            if not job_inputs:
                print >> sys.stderr, 'ERROR: No input path specified'
                sys.exit(1)

            outputopt = getopt(opts, 'output', delete=False)
            if not outputopt:
                print >> sys.stderr, 'ERROR: No output path specified'
                sys.exit(1)
            job_output = outputopt[0]
            pre_prefix = job_output + "_pre"

            newopts = {}
            newopts['iteration'] = str(step)
            newopts['itercount'] = str(len(self.iters))

            sources = kwargs['input']
            if type(sources) == int:
                sources = [sources]
            reads_job_input = sources == [-1]

            if reads_job_input:
                kwargs['input'] = job_inputs
                # Only honour a requested delete on the iteration recorded
                # under self.deps[-1].
                newopts['delinputs'] = 'no'
                if delinputsopt and delinputsopt[0] == 'yes' and step == self.deps[-1]:
                    newopts['delinputs'] = 'yes'
            else:
                if -1 in sources:
                    print >> sys.stderr, 'ERROR: Cannot mix job input with intermediate results'
                    sys.exit(1)
                kwargs['input'] = [pre_prefix + str(dep + 1) for dep in sources]
                newopts['inputformat'] = 'code'
                if addpathopt and addpathopt[0] == 'yes':  # not when == 'iter'
                    newopts['addpath'] = 'no'
                newopts['delinputs'] = 'no' # we'll take care of it ourselves

            if step != len(self.iters) - 1:
                kwargs['output'] = pre_prefix + str(step + 1)
                newopts['outputformat'] = 'code'
                if getpathopt and getpathopt[0] == 'yes':  # not when == 'iter'
                    newopts['getpath'] = 'no'
            else:
                kwargs['output'] = job_output

            # Drop every option that newopts overrides, then append newopts.
            stale = [pos for (pos, (name, _)) in enumerate(opts) if name in newopts]
            for pos in reversed(stale):
                del opts[pos]
            opts += newopts.iteritems()
            kwargs['opts'] = opts

            run(*args, **kwargs)

            backend = get_backend(opts)
            fs = backend.create_filesystem(opts)
            if reads_job_input or (preoutputsopt and preoutputsopt[0] == 'yes'):
                continue
            for dep in sources:
                if step == self.deps[dep]:
                    fs.rm(pre_prefix + str(dep + 1), opts)
Exemple #8
0
    def run(self):
        """Run all iterations of this job one after another.

        When sys.argv[1] exists and does not begin with "-", each iteration
        is passed straight to the module-level run() with "iter" set to its
        index.  In the other case the options for each iteration are derived
        from the job-level ones: intermediate results are written to and read
        from "<output>_pre<N>", overriding options are collected in a dict
        and merged into opts, and intermediate paths are removed after the
        iteration whose index matches self.deps[<producer index>] unless the
        "preoutputs" option is "yes".

        Calls sys.exit(1) if no input or output path was specified, or if an
        iteration lists -1 together with other inputs.
        """
        for (index, (args, kwargs)) in enumerate(self.iters):
            kwargs["iter"] = index

            if len(sys.argv) > 1 and not sys.argv[1][0] == "-":
                run(*args, **kwargs)
                continue

            opts = kwargs.get("opts", [])
            opts += parseargs(sys.argv[1:])

            # Looked up without delete=False (unlike the following lookups);
            # presumably getopt removes these two from opts -- TODO confirm.
            preoutputsopt = getopt(opts, "preoutputs")
            delinputsopt = getopt(opts, "delinputs")
            addpathopt = getopt(opts, "addpath", delete=False)
            getpathopt = getopt(opts, "getpath", delete=False)

            job_inputs = getopt(opts, "input", delete=False)
            if not job_inputs:
                print >> sys.stderr, "ERROR: No input path specified"
                sys.exit(1)

            outputopt = getopt(opts, "output", delete=False)
            if not outputopt:
                print >> sys.stderr, "ERROR: No output path specified"
                sys.exit(1)
            job_output = outputopt[0]

            def intermediate(producer):
                # Path holding the output of the iteration with this index.
                return job_output + "_pre" + str(producer + 1)

            total = len(self.iters)
            newopts = {}
            newopts["iteration"] = str(index)
            newopts["itercount"] = str(total)

            wanted = kwargs["input"]
            if type(wanted) == int:
                wanted = [wanted]
            from_job_input = wanted == [-1]

            if not from_job_input:
                if -1 in wanted:
                    print >> sys.stderr, "ERROR: Cannot mix job input with intermediate results"
                    sys.exit(1)
                kwargs["input"] = [intermediate(producer) for producer in wanted]
                newopts["inputformat"] = "code"
                if addpathopt and addpathopt[0] == "yes":  # not when == 'iter'
                    newopts["addpath"] = "no"
                newopts["delinputs"] = "no"  # we'll take care of it ourselves
            else:
                kwargs["input"] = job_inputs
                newopts["delinputs"] = "no"
                if delinputsopt and delinputsopt[0] == "yes" and index == self.deps[-1]:
                    newopts["delinputs"] = "yes"

            if index == total - 1:
                kwargs["output"] = job_output
            else:
                kwargs["output"] = intermediate(index)
                newopts["outputformat"] = "code"
                if getpathopt and getpathopt[0] == "yes":  # not when == 'iter'
                    newopts["getpath"] = "no"

            # Remove the options that newopts replaces before appending it.
            overridden = [pos for (pos, (name, _)) in enumerate(opts) if name in newopts]
            for pos in reversed(overridden):
                del opts[pos]
            opts += newopts.iteritems()
            kwargs["opts"] = opts

            run(*args, **kwargs)

            backend = get_backend(opts)
            fs = backend.create_filesystem(opts)
            keep_intermediates = preoutputsopt and preoutputsopt[0] == "yes"
            if keep_intermediates or from_job_input:
                continue
            for producer in wanted:
                if index == self.deps[producer]:
                    fs.rm(intermediate(producer), opts)
Exemple #9
0
def run(
    mapper,
    reducer=None,
    combiner=None,
    buffersize=None,
    partitioner=None,
    grouper=None,
    mapconf=None,
    redconf=None,
    combconf=None,
    mapclose=None,
    redclose=None,
    combclose=None,
    opts=None,
    input=None,
    output=None,
    premapper=None,
    postreducer=None,
    iter=0,
):

    if len(sys.argv) > 1 and not sys.argv[1][0] == "-":
        # This case corresponds to running a mapper or reducer

        iterarg = 0  # default value
        if len(sys.argv) > 2:
            iterarg = int(sys.argv[2])
        memlim = None  # memory limit
        if len(sys.argv) > 3:
            memlim = int(sys.argv[3])
            resource.setrlimit(resource.RLIMIT_AS, (memlim, memlim))

        mrbase_class = loadclassname(os.environ["dumbo_mrbase_class"])
        jk_class = loadclassname(os.environ["dumbo_jk_class"])
        runinfo = loadclassname(os.environ["dumbo_runinfo_class"])()

        if iterarg == iter:
            if sys.argv[1].startswith("map"):
                if type(mapper) in (types.ClassType, type):
                    mappercls = type("DumboMapper", (mapper, mrbase_class), {})
                    mapper = mappercls()
                if hasattr(mapper, "configure"):
                    mapconf = mapper.configure
                if hasattr(mapper, "close"):
                    mapclose = mapper.close
                if hasattr(mapper, "map"):
                    mapper = mapper.map
                if type(combiner) in (types.ClassType, type):
                    combinercls = type("DumboCombiner", (combiner, mrbase_class), {})
                    combiner = combinercls()
                if hasattr(combiner, "configure"):
                    combconf = combiner.configure
                if hasattr(combiner, "close"):
                    combclose = combiner.close
                if hasattr(combiner, "reduce"):
                    combiner = combiner.reduce
                try:
                    print >> sys.stderr, "INFO: consuming %s" % os.environ["map_input_file"]
                except KeyError:
                    pass
                if os.environ.has_key("stream_map_input") and os.environ["stream_map_input"].lower() == "typedbytes":
                    print >> sys.stderr, "INFO: inputting typed bytes"
                    try:
                        import ctypedbytes as typedbytes

                        print >> sys.stderr, "INFO: using ctypedbytes"
                    except ImportError:
                        import typedbytes
                    inputs = typedbytes.PairedInput(sys.stdin).reads()
                else:
                    inputs = loadcode(line[:-1] for line in sys.stdin)
                if mapconf:
                    mapconf()
                if combconf:
                    combconf()
                if os.environ.has_key("dumbo_addpath"):
                    path = runinfo.get_input_path()
                    inputs = (((path, k), v) for (k, v) in inputs)
                if os.environ.has_key("dumbo_joinkeys"):
                    inputs = ((jk_class(k), v) for (k, v) in inputs)
                if os.environ.has_key("dumbo_parser"):
                    parser = os.environ["dumbo_parser"]
                    clsname = parser.split(".")[-1]
                    modname = ".".join(parser.split(".")[:-1])
                    if not modname:
                        raise ImportError(parser)
                    module = __import__(modname, fromlist=[clsname])
                    parse = getattr(module, clsname)().parse
                    outputs = itermap(inputs, mapper, parse)
                elif os.environ.has_key("dumbo_record"):
                    record = os.environ["dumbo_record"]
                    clsname = record.split(".")[-1]
                    modname = ".".join(record.split(".")[:-1])
                    if not modname:
                        raise ImportError(parser)
                    module = __import__(modname, fromlist=[clsname])
                    set = getattr(module, clsname)().set
                    outputs = itermap(inputs, mapper, lambda v: set(*v))
                else:
                    outputs = itermap(inputs, mapper)
                if combiner and type(combiner) != str:
                    if (not buffersize) and memlim:
                        buffersize = int(memlim * 0.33) / 512  # educated guess
                        print >> sys.stderr, "INFO: buffersize =", buffersize
                    inputs = sorted(outputs, buffersize)
                    if os.environ.has_key("dumbo_joinkeys"):
                        outputs = iterreduce(inputs, combiner, keyfunc=jk_class.fromjoinkey)
                    elif grouper is not None:
                        outputs = iterreduce(inputs, combiner, grouper=grouper)
                    else:
                        outputs = iterreduce(inputs, combiner)
                if os.environ.has_key("dumbo_joinkeys"):
                    outputs = ((jk.dump(), v) for (jk, v) in outputs)
                if os.environ.has_key("stream_map_output") and os.environ["stream_map_output"].lower() == "typedbytes":
                    print >> sys.stderr, "INFO: outputting typed bytes"
                    try:
                        import ctypedbytes as typedbytes

                        print >> sys.stderr, "INFO: using ctypedbytes"
                    except ImportError:
                        import typedbytes
                    typedbytes.PairedOutput(sys.stdout).writes(outputs)
                else:
                    for output in dumpcode(outputs):
                        print "\t".join(output)
                if combclose:
                    combclose()
                if mapclose:
                    mapclose()
            elif reducer:
                if type(reducer) in (types.ClassType, type):
                    reducercls = type("DumboReducer", (reducer, mrbase_class), {})
                    reducer = reducercls()
                if hasattr(reducer, "configure"):
                    redconf = reducer.configure
                if hasattr(reducer, "close"):
                    redclose = reducer.close
                if hasattr(reducer, "reduce"):
                    reducer = reducer.reduce
                if (
                    os.environ.has_key("stream_reduce_input")
                    and os.environ["stream_reduce_input"].lower() == "typedbytes"
                ):
                    print >> sys.stderr, "INFO: inputting typed bytes"
                    try:
                        import ctypedbytes as typedbytes

                        print >> sys.stderr, "INFO: using ctypedbytes"
                    except ImportError:
                        import typedbytes
                    inputs = typedbytes.PairedInput(sys.stdin).reads()
                else:
                    inputs = loadcode(line[:-1] for line in sys.stdin)
                if redconf:
                    redconf()
                if os.environ.has_key("dumbo_joinkeys"):
                    outputs = iterreduce(inputs, reducer, keyfunc=jk_class.fromdump)
                    outputs = ((jk.body, v) for (jk, v) in outputs)
                elif grouper is not None:
                    outputs = iterreduce(inputs, reducer, grouper=grouper)
                else:
                    outputs = iterreduce(inputs, reducer)
                if (
                    os.environ.has_key("stream_reduce_output")
                    and os.environ["stream_reduce_output"].lower() == "typedbytes"
                ):
                    print >> sys.stderr, "INFO: outputting typed bytes"
                    try:
                        import ctypedbytes as typedbytes

                        print >> sys.stderr, "INFO: using ctypedbytes"
                    except ImportError:
                        import typedbytes
                    typedbytes.PairedOutput(sys.stdout).writes(outputs)
                else:
                    for output in dumpcode(outputs):
                        print "\t".join(output)
                if redclose:
                    redclose()
            else:
                for output in dumpcode(inputs):
                    print "\t".join(output)
    else:
        # This case builds the hadoop streaming command to run
        if not opts:
            opts = []
        if type(mapper) == str:
            opts.append(("mapper", mapper))
        elif hasattr(mapper, "opts"):
            opts += mapper.opts
        if type(reducer) == str:
            opts.append(("reducer", reducer))
        elif hasattr(reducer, "opts"):
            opts += reducer.opts
        if type(combiner) == str:
            opts.append(("combiner", combiner))
        if type(partitioner) == str:
            opts.append(("partitioner", partitioner))
        opts += parseargs(sys.argv[1:])

        if input is not None:
            getopt(opts, "input", delete=True)  # delete -input opts
            for infile in input:
                opts.append(("input", infile))

        if output is None:
            outputopt = getopt(opts, "output", delete=False)
            if not outputopt:
                print >> sys.stderr, "ERROR: No output path specified"
                sys.exit(1)
            output = outputopt[0]

        newopts = {}

        newopts["output"] = output
        if not reducer:
            newopts["numreducetasks"] = "0"

        (key, delindexes) = (None, [])
        for (index, (key, value)) in enumerate(opts):
            if newopts.has_key(key):
                delindexes.append(index)
        for delindex in reversed(delindexes):
            del opts[delindex]
        opts += newopts.iteritems()

        backend = get_backend(opts)

        overwriteopt = getopt(opts, "overwrite")
        checkoutopt = getopt(opts, "checkoutput")
        checkoutput = not (checkoutopt and checkoutopt[0] == "no")
        fs = backend.create_filesystem(opts)
        if overwriteopt and overwriteopt[0] == "yes":
            fs.rm(output, opts)
        elif checkoutput and fs.exists(output, opts) == 0:
            print >> sys.stderr, "ERROR: Output path exists already: %s" % output
            sys.exit(1)

        opts.append(("cmdenv", "dumbo_mrbase_class=" + getclassname(backend.get_mapredbase_class(opts))))
        opts.append(("cmdenv", "dumbo_jk_class=" + getclassname(backend.get_joinkey_class(opts))))
        opts.append(("cmdenv", "dumbo_runinfo_class=" + getclassname(backend.get_runinfo_class(opts))))

        # The function of the premapper is like a starter before a
        # particular map reduce job.  It'll allow you to get the
        # file system into the right state.
        if premapper is not None:
            # TODO Think if these are the right options
            premapper(backend, fs, opts)

        retval = backend.create_iteration(opts).run()
        if retval == 127:
            print >> sys.stderr, 'ERROR: Are you sure that "python" is on your path?'

        if retval != 0:
            sys.exit(retval)

        # The function of the post-mapper is to do post-processing
        # on the output of the MapReduce iteration
        if postreducer is not None:
            # TODO Think if these are the right options
            postreducer(backend, fs, opts)
Exemple #10
0
    def run(self):
        """Run the iterations of this job, as a task or as the launcher.

        If sys.argv[1] exists and does not start with '-', only the iteration
        whose index is given by sys.argv[2] (default 0) is passed to the
        module-level run().  Otherwise every iteration in self.iters is run
        in order with options derived from the job-level ones: inputs and
        outputs are chained through "<output>_pre<N>" paths, overriding
        options are merged into opts, and an intermediate path is removed
        after the iteration whose index equals self.deps[<producer index>]
        unless the 'preoutputs' option contains 'yes'.

        On the first iteration every 'inputfile' option is read as a file
        with one input path per line, and each non-blank line is added as an
        'input' option.

        Calls sys.exit(1) when no input or output path is available or when
        an iteration mixes the job input (-1) with intermediate results.
        """
        if len(sys.argv) > 1 and not sys.argv[1][0] == '-':
            iterarg = 0  # default value
            if len(sys.argv) > 2:
                iterarg = int(sys.argv[2])
            # for loop isn't necessary but helps for error reporting apparently
            for args, kwargs in self.iters[iterarg:iterarg+1]:
                kwargs['iter'] = iterarg
                run(*args, **kwargs)
        else:
            for _iter, (args, kwargs) in enumerate(self.iters):
                kwargs['iter'] = _iter
                opts = Options(kwargs.get('opts', []))
                opts += self._argopts

                # this has to be done early, while all the opts are still there
                backend = get_backend(opts)
                fs = backend.create_filesystem(opts)

                preoutputsopt = opts.pop('preoutputs')
                delinputsopt = opts.pop('delinputs')

                # only do this for the first iteration...
                # (compare the loop variable `_iter`; the bare name `iter` is
                # the builtin function and never equals 0)
                # NOTE(review): opts is rebuilt for every iteration, so any
                # 'inputfile' options are still present, unexpanded, on later
                # iterations -- confirm that is intended.
                if _iter == 0:
                    # handle inputfile options here; we are past the point where a
                    # bunch of things would get dumped to the commandline.
                    inputfiles = getopt(opts, 'inputfile', delete=True) # this deletes the inputfile options from opts
                    for inputfile in inputfiles:
                        listing = open(inputfile)
                        try:
                            for line in listing:
                                infile = line.strip()
                                if infile:  # ignore blank lines
                                    opts.append(("input", infile))
                        finally:
                            listing.close()

                job_inputs = opts['input']
                if not job_inputs:
                    print >> sys.stderr, 'ERROR: No input path specified'
                    sys.exit(1)

                outputopt = opts['output']
                if not outputopt:
                    print >> sys.stderr, 'ERROR: No output path specified'
                    sys.exit(1)

                job_output = outputopt[0]

                newopts = Options()
                newopts.add('iteration', str(_iter))
                newopts.add('itercount', str(len(self.iters)))

                _input = kwargs['input']
                if type(_input) == int:
                    _input = [_input]
                if _input == [-1]:
                    kwargs['input'] = job_inputs
                    # Only honour a requested delete on the iteration
                    # recorded under self.deps[-1].
                    delinputs = 'yes' if 'yes' in delinputsopt and _iter == self.deps[-1] else 'no'
                    newopts.add('delinputs', delinputs)
                else:
                    if -1 in _input:
                        print >> sys.stderr, 'ERROR: Cannot mix job input with intermediate results'
                        sys.exit(1)
                    kwargs['input'] = [job_output + "_pre" + str(initer + 1) for initer in _input]
                    newopts.add('inputformat', 'code')
                    if 'yes' in opts['addpath']:  # not when == 'iter'
                        newopts.add('addpath', 'no')
                    newopts.add('delinputs', 'no')  # cleaned up below instead

                if _iter == len(self.iters) - 1:
                    kwargs['output'] = job_output
                else:
                    kwargs['output'] = job_output + "_pre" + str(_iter + 1)
                    newopts.add('outputformat', 'code')
                    if 'yes' in opts['getpath']:  # not when == 'iter'
                        newopts.add('getpath', 'no')

                # Replace every option that newopts overrides.
                keys = [k for k, _ in opts if k in newopts]
                opts.remove(*keys)
                opts += newopts

                kwargs['opts'] = opts

                if "initializer" not in kwargs and self.initializer is not None:
                    kwargs["initializer"] = self.initializer

                run(*args, **kwargs)

                if 'yes' not in preoutputsopt and _input != [-1]:
                    for initer in _input:
                        if _iter == self.deps[initer]:
                            fs.rm(job_output + "_pre" + str(initer + 1), opts)
Exemple #11
0
def run(mapper,
        reducer=None,
        combiner=None,
        buffersize=None,
        mapconf=None,
        redconf=None,
        combconf=None,
        mapclose=None,
        redclose=None,
        combclose=None,
        opts=None,
        iter=0,
        itercnt=1):
    if len(sys.argv) > 1 and not sys.argv[1][0] == '-':
        iterarg = 0  # default value
        if len(sys.argv) > 2:
            iterarg = int(sys.argv[2])
        memlim = None  # memory limit
        if len(sys.argv) > 3:
            memlim = int(sys.argv[3])
            resource.setrlimit(resource.RLIMIT_AS, (memlim, memlim))
            
        mrbase_class = loadclassname(os.environ['dumbo_mrbase_class'])
        jk_class = loadclassname(os.environ['dumbo_jk_class'])
        
        if iterarg == iter:
            if sys.argv[1].startswith('map'):
                if type(mapper) in (types.ClassType, type):
                    mappercls = type('DumboMapper', (mapper, mrbase_class), {})
                    mapper = mappercls()
                if hasattr(mapper, 'configure'):
                    mapconf = mapper.configure
                if hasattr(mapper, 'close'):
                    mapclose = mapper.close
                if hasattr(mapper, 'map'):
                    mapper = mapper.map
                if type(combiner) in (types.ClassType, type):
                    combinercls = type('DumboCombiner', (combiner, mrbase_class), {})
                    combiner = combinercls()
                if hasattr(combiner, 'configure'):
                    combconf = combiner.configure
                if hasattr(combiner, 'close'):
                    combclose = combiner.close
                if hasattr(combiner, 'reduce'):
                    combiner = combiner.reduce
                try:
                    print >> sys.stderr, "INFO: consuming %s" % \
                                         os.environ['map_input_file']
                except KeyError:
                    pass
                if os.environ.has_key('stream_map_input') and \
                os.environ['stream_map_input'].lower() == 'typedbytes':
                    print >> sys.stderr, "INFO: inputting typed bytes"
                    try: import ctypedbytes as typedbytes
                    except ImportError: import typedbytes
                    inputs = typedbytes.PairedInput(sys.stdin).reads()
                else:
                    inputs = loadcode(line[:-1] for line in sys.stdin)
                if mapconf:
                    mapconf()
                if combconf:
                    combconf()
                if os.environ.has_key('dumbo_addpath'):
                    path = os.environ['map_input_file']
                    inputs = (((path, k), v) for (k, v) in inputs)
                if os.environ.has_key('dumbo_joinkeys'):
                    inputs = ((jk_class(k), v) for (k, v) in inputs)
                if os.environ.has_key('dumbo_parser'):
                    parser = os.environ['dumbo_parser']
                    clsname = parser.split('.')[-1]          
                    modname = '.'.join(parser.split('.')[:-1])            
                    if not modname:
                        raise ImportError(parser)
                    module = __import__(modname, fromlist=[clsname])
                    parse = getattr(module, clsname)().parse
                    outputs = itermap(inputs, mapper, parse)
                elif os.environ.has_key('dumbo_record'):
                    record = os.environ['dumbo_record']
                    clsname = record.split('.')[-1]
                    modname = '.'.join(record.split('.')[:-1])
                    if not modname:
                        raise ImportError(parser)
                    module = __import__(modname, fromlist=[clsname])
                    set = getattr(module, clsname)().set
                    outputs = itermap(inputs, mapper, lambda v: set(*v))
                else:
                    outputs = itermap(inputs, mapper)
                if combiner and type(combiner) != str:
                    if (not buffersize) and memlim:
                        buffersize = int(memlim * 0.33) / 512  # educated guess
                        print >> sys.stderr, 'INFO: buffersize =', buffersize
                    inputs = sorted(outputs, buffersize)
                    if os.environ.has_key('dumbo_joinkeys'):
                        outputs = iterreduce(inputs, combiner,
                                             keyfunc=jk_class.fromjoinkey)
                    else:
                        outputs = iterreduce(inputs, combiner)
                if os.environ.has_key('dumbo_joinkeys'):
                    outputs = ((jk.dump(), v) for (jk, v) in outputs)
                if os.environ.has_key('stream_map_output') and \
                os.environ['stream_map_output'].lower() == 'typedbytes':
                    print >> sys.stderr, "INFO: outputting typed bytes"
                    try: import ctypedbytes as typedbytes
                    except ImportError: import typedbytes
                    typedbytes.PairedOutput(sys.stdout).writes(outputs)
                else:
                    for output in dumpcode(outputs):
                        print '\t'.join(output)
                if combclose:
                    combclose()
                if mapclose:
                    mapclose()
            elif reducer:
                if type(reducer) in (types.ClassType, type):
                    reducercls = type('DumboReducer', (reducer, mrbase_class), {})
                    reducer = reducercls()
                if hasattr(reducer, 'configure'):
                    redconf = reducer.configure
                if hasattr(reducer, 'close'):
                    redclose = reducer.close
                if hasattr(reducer, 'reduce'):
                    reducer = reducer.reduce
                if os.environ.has_key('stream_reduce_input') and \
                os.environ['stream_reduce_input'].lower() == 'typedbytes':
                    print >> sys.stderr, "INFO: inputting typed bytes"
                    try: import ctypedbytes as typedbytes
                    except ImportError: import typedbytes
                    inputs = typedbytes.PairedInput(sys.stdin).reads()
                else:
                    inputs = loadcode(line[:-1] for line in sys.stdin)
                if redconf:
                    redconf()
                if os.environ.has_key('dumbo_joinkeys'):
                    outputs = iterreduce(inputs, reducer,
                                         keyfunc=jk_class.fromdump)
                    outputs = ((jk.body, v) for (jk, v) in outputs)
                else:
                    outputs = iterreduce(inputs, reducer)
                if os.environ.has_key('stream_reduce_output') and \
                os.environ['stream_reduce_output'].lower() == 'typedbytes':
                    print >> sys.stderr, "INFO: outputting typed bytes"
                    try: import ctypedbytes as typedbytes
                    except ImportError: import typedbytes
                    typedbytes.PairedOutput(sys.stdout).writes(outputs)
                else:
                    for output in dumpcode(outputs):
                        print '\t'.join(output)
                if redclose:
                    redclose()
            else:
                for output in dumpcode(inputs):
                    print '\t'.join(output)
    else:
        if not opts:
            opts = []
        if type(mapper) == str:
            opts.append(('mapper', mapper))
        elif hasattr(mapper, 'opts'):
            opts += mapper.opts
        if type(reducer) == str:
            opts.append(('reducer', reducer))
        elif hasattr(reducer, 'opts'):
            opts += reducer.opts
        if type(combiner) == str:
            opts.append(('combiner', combiner))
        opts += parseargs(sys.argv[1:])
        
        outputopt = getopt(opts, 'output', delete=False)
        if not outputopt:
            print >> sys.stderr, 'ERROR: No output path specified'
            sys.exit(1)
        output = outputopt[0]
        
        newopts = {}
        newopts['iteration'] = str(iter)
        newopts['itercount'] = str(itercnt)
        preoutputsopt = getopt(opts, 'preoutputs')
        addpathopt = getopt(opts, 'addpath', delete=False)
        getpathopt = getopt(opts, 'getpath', delete=False)
        if iter != 0:
            newopts['input'] = output + "_pre" + str(iter)
            if not (preoutputsopt and preoutputsopt[0] == 'yes'):
                newopts['delinputs'] = 'yes'
            newopts['inputformat'] = 'code'
            if addpathopt and addpathopt[0] == 'yes':  # not when == 'iter'
                newopts['addpath'] = 'no'
        if iter < itercnt - 1:
            output += "_pre" + str(iter + 1)
            newopts['output'] = output
            newopts['outputformat'] = 'code'
            if getpathopt and getpathopt[0] == 'yes':  # not when == 'iter'
                newopts['getpath'] = 'no'
        if not reducer:
            newopts['numreducetasks'] = '0'
        (key, delindexes) = (None, [])
        for (index, (key, value)) in enumerate(opts):
            if newopts.has_key(key):
                delindexes.append(index)
        for delindex in reversed(delindexes):
            del opts[delindex]
        opts += newopts.iteritems()
        
        backend = get_backend(opts)

        overwriteopt = getopt(opts, 'overwrite')
        checkoutopt = getopt(opts, 'checkoutput')
        checkoutput = not (checkoutopt and checkoutopt[0] == 'no')
        fs = backend.create_filesystem(opts)
        if overwriteopt and overwriteopt[0] == 'yes':
            fs.rm(output, opts)
        elif checkoutput and fs.exists(output, opts) == 0:
            print >> sys.stderr, 'ERROR: Output path exists already: %s' % output
            sys.exit(1)
        
        opts.append(('cmdenv', 'dumbo_mrbase_class=' + \
                     getclassname(backend.get_mapredbase_class(opts))))
        opts.append(('cmdenv', 'dumbo_jk_class=' + \
                     getclassname(backend.get_joinkey_class(opts))))                  
        retval = backend.create_iteration(opts).run()
        if retval == 127:
            print >> sys.stderr, 'ERROR: Are you sure that "python" is on your path?'
        if retval != 0:
            sys.exit(retval)