def testpairio(self):
    """Round-trip (index, object) pairs through PairedOutput/PairedInput.

    Every object in ``TestIO.objects`` is written to ``test.bin`` keyed by
    its position, then read back and compared with the original at that
    position.  Both file handles are closed and the scratch file is removed
    even when writing, reading or an assertion fails.
    """
    objects = TestIO.objects
    try:
        outfile = open("test.bin", "wb")
        try:
            typedbytes.PairedOutput(outfile).writes(enumerate(objects))
        finally:
            outfile.close()

        infile = open("test.bin", "rb")
        try:
            for index, record in typedbytes.PairedInput(infile).reads():
                self.assertEqual(objects[index], record)
        finally:
            infile.close()
    finally:
        # The file may not exist if the very first open() failed; do not
        # let the cleanup hide that original error.
        if os.path.exists("test.bin"):
            os.remove("test.bin")
def testwrongio(self):
    """Reading unpaired data with PairedInput must raise StructError.

    A single value is written with the non-paired ``Output`` writer; the
    file is then read with ``PairedInput`` and fully consuming its records
    is expected to raise ``StructError``.  File handles are closed and the
    scratch file is removed on every path, including assertion failure.
    """
    try:
        outfile = open("test.bin", "wb")
        try:
            typedbytes.Output(outfile).writes([1])
        finally:
            outfile.close()

        infile = open("test.bin", "rb")
        try:
            paired = typedbytes.PairedInput(infile)
            # reads() is consumed inside the lambda so that the error is
            # raised while assertRaises is watching.
            self.assertRaises(StructError, lambda: list(paired.reads()))
        finally:
            infile.close()
    finally:
        # The file may not exist if the very first open() failed; do not
        # let the cleanup hide that original error.
        if os.path.exists("test.bin"):
            os.remove("test.bin")
def _run_log(message):
    """Write one diagnostic line to stderr."""
    sys.stderr.write(message + '\n')


def _run_as_callable(obj, basecls, clsname, funcattr, conf, close):
    """Turn a mapper/combiner/reducer into a plain callable plus its hooks.

    A class (old- or new-style) is combined with ``basecls`` into a new type
    named ``clsname`` and instantiated.  ``configure`` and ``close``
    attributes found on the resulting object replace ``conf`` and ``close``;
    an attribute named ``funcattr`` replaces the object itself.

    Returns the tuple ``(callable, conf, close)``.
    """
    if type(obj) in (types.ClassType, type):
        obj = type(clsname, (obj, basecls), {})()
    if hasattr(obj, 'configure'):
        conf = obj.configure
    if hasattr(obj, 'close'):
        close = obj.close
    if hasattr(obj, funcattr):
        obj = getattr(obj, funcattr)
    return obj, conf, close


def _run_typedbytes_module():
    """Return ``ctypedbytes`` when importable, else ``typedbytes``."""
    try:
        import ctypedbytes as typedbytes
    except ImportError:
        import typedbytes
    return typedbytes


def _run_is_typedbytes(envvar):
    """Tell whether environment variable ``envvar`` selects typed bytes."""
    return envvar in os.environ and os.environ[envvar].lower() == 'typedbytes'


def _run_read_pairs(envvar):
    """Return the (key, value) stream read from stdin.

    Typed bytes are used when ``envvar`` says so; otherwise every stdin line
    (minus its last character) is handed to ``loadcode``.
    """
    if _run_is_typedbytes(envvar):
        _run_log("INFO: inputting typed bytes")
        return _run_typedbytes_module().PairedInput(sys.stdin).reads()
    return loadcode(line[:-1] for line in sys.stdin)


def _run_write_pairs(envvar, pairs):
    """Write ``pairs`` to stdout.

    Typed bytes are used when ``envvar`` says so; otherwise the fields
    produced by ``dumpcode`` are written tab-separated, one pair per line.
    """
    if _run_is_typedbytes(envvar):
        _run_log("INFO: outputting typed bytes")
        _run_typedbytes_module().PairedOutput(sys.stdout).writes(pairs)
    else:
        for fields in dumpcode(pairs):
            sys.stdout.write('\t'.join(fields) + '\n')


def _run_load_factory(dotted):
    """Import and return the attribute named by ``package.module.Name``.

    Raises ImportError carrying ``dotted`` when it has no module part.
    """
    modname, _, clsname = dotted.rpartition('.')
    if not modname:
        raise ImportError(dotted)
    module = __import__(modname, fromlist=[clsname])
    return getattr(module, clsname)


def _run_map_task(mapper, combiner, buffersize, mapconf, combconf, mapclose,
                  combclose, memlim, mrbase_class, jk_class, runinfo):
    """Execute the map side (with optional in-task combiner) of a task."""
    mapper, mapconf, mapclose = _run_as_callable(
        mapper, mrbase_class, 'DumboMapper', 'map', mapconf, mapclose)
    combiner, combconf, combclose = _run_as_callable(
        combiner, mrbase_class, 'DumboCombiner', 'reduce', combconf,
        combclose)

    try:
        _run_log("INFO: consuming %s" % os.environ['map_input_file'])
    except KeyError:
        pass  # the input file name is purely informational

    inputs = _run_read_pairs('stream_map_input')
    if mapconf:
        mapconf()
    if combconf:
        combconf()

    if 'dumbo_addpath' in os.environ:
        path = runinfo.get_input_path()
        inputs = (((path, k), v) for (k, v) in inputs)
    if 'dumbo_joinkeys' in os.environ:
        inputs = ((jk_class(k), v) for (k, v) in inputs)

    if 'dumbo_parser' in os.environ:
        parse = _run_load_factory(os.environ['dumbo_parser'])().parse
        outputs = itermap(inputs, mapper, parse)
    elif 'dumbo_record' in os.environ:
        # A missing module part is reported with the record name itself;
        # the unbound name "parser" used to be referenced here.
        setfields = _run_load_factory(os.environ['dumbo_record'])().set
        outputs = itermap(inputs, mapper, lambda v: setfields(*v))
    else:
        outputs = itermap(inputs, mapper)

    # A combiner given as a string is not run in-process.
    if combiner and type(combiner) != str:
        if (not buffersize) and memlim:
            buffersize = int(memlim * 0.33) // 512  # educated guess
            _run_log('INFO: buffersize = ' + str(buffersize))
        # "sorted" is whatever the module has in scope under that name; it
        # is called with the buffer size as second positional argument.
        inputs = sorted(outputs, buffersize)
        if 'dumbo_joinkeys' in os.environ:
            outputs = iterreduce(inputs, combiner,
                                 keyfunc=jk_class.fromjoinkey)
        else:
            outputs = iterreduce(inputs, combiner)
    if 'dumbo_joinkeys' in os.environ:
        outputs = ((jk.dump(), v) for (jk, v) in outputs)

    _run_write_pairs('stream_map_output', outputs)
    if combclose:
        combclose()
    if mapclose:
        mapclose()


def _run_reduce_task(reducer, redconf, redclose, mrbase_class, jk_class):
    """Execute the reduce side of a task."""
    reducer, redconf, redclose = _run_as_callable(
        reducer, mrbase_class, 'DumboReducer', 'reduce', redconf, redclose)

    inputs = _run_read_pairs('stream_reduce_input')
    if redconf:
        redconf()

    if 'dumbo_joinkeys' in os.environ:
        outputs = iterreduce(inputs, reducer, keyfunc=jk_class.fromdump)
        outputs = ((jk.body, v) for (jk, v) in outputs)
    else:
        outputs = iterreduce(inputs, reducer)

    _run_write_pairs('stream_reduce_output', outputs)
    if redclose:
        redclose()


def _run_submit(mapper, reducer, combiner, opts, inputpaths, outputpath):
    """Assemble the options for one iteration and run it on the backend."""
    opts = Options(opts)
    if type(mapper) == str:
        opts.add('mapper', mapper)
    elif hasattr(mapper, 'opts'):
        opts += mapper.opts
    if type(reducer) == str:
        opts.add('reducer', reducer)
    elif hasattr(reducer, 'opts'):
        opts += reducer.opts
    if type(combiner) == str:
        opts.add('combiner', combiner)
    opts += parseargs(sys.argv[1:])

    # Explicit input paths replace any 'input' options collected so far.
    if inputpaths is not None:
        opts.remove('input')
        for infile in inputpaths:
            opts.add('input', infile)

    if outputpath is None:
        outputopt = opts['output']
        if not outputopt:
            _run_log('ERROR: No output path specified')
            sys.exit(1)
        outputpath = outputopt[0]

    # Options set here override same-named options collected above.
    newopts = Options()
    newopts.add('output', outputpath)
    if not reducer:
        newopts.add('numreducetasks', '0')
    keys = [k for k, _ in opts if k in newopts]
    opts.remove(*keys)
    opts += newopts

    backend = get_backend(opts)

    overwriteopt = opts.pop('overwrite')
    checkoutput = 'no' not in opts.pop('checkoutput')
    fs = backend.create_filesystem(opts)
    if 'yes' in overwriteopt:
        fs.rm(output_path_guard(outputpath), opts) if False else \
            fs.rm(outputpath, opts)
    elif checkoutput and fs.exists(outputpath, opts) == 0:
        # A result of 0 from fs.exists() is treated as "path exists".
        _run_log('ERROR: Output path exists already: %s' % outputpath)
        sys.exit(1)

    opts.add('cmdenv', 'dumbo_mrbase_class=' +
             getclassname(backend.get_mapredbase_class(opts)))
    opts.add('cmdenv', 'dumbo_jk_class=' +
             getclassname(backend.get_joinkey_class(opts)))
    opts.add('cmdenv', 'dumbo_runinfo_class=' +
             getclassname(backend.get_runinfo_class(opts)))

    retval = backend.create_iteration(opts).run()
    if retval == 127:
        _run_log('ERROR: Are you sure that "python" is on your path?')
    if retval != 0:
        sys.exit(retval)


def run(mapper, reducer=None, combiner=None, buffersize=None, mapconf=None,
        redconf=None, combconf=None, mapclose=None, redclose=None,
        combclose=None, opts=None, input=None, output=None, iter=0):
    """Run one iteration, either as a streaming task or as the submitter.

    Task mode is selected when ``sys.argv[1]`` exists and does not start
    with ``'-'``:

    * ``sys.argv[2]`` (default 0) is the iteration index; nothing is
      processed unless it equals ``iter``.
    * ``sys.argv[3]``, when present, is applied as both the soft and hard
      ``RLIMIT_AS`` limit and is used to guess ``buffersize``.
    * ``sys.argv[1]`` starting with ``'map'`` runs the map side (including
      an in-process combiner); otherwise the reduce side runs.  Without a
      reducer the reduce side copies its input pairs to its output.
    * The environment variables ``dumbo_mrbase_class``, ``dumbo_jk_class``
      and ``dumbo_runinfo_class`` must be set (KeyError otherwise).

    In every other case the options are assembled from ``opts``, the
    ``opts`` attributes of mapper/reducer and the command line, and the
    iteration is run through the backend.  The process exits with status 1
    when no output path is known or the output already exists, and with the
    backend's return value when that is non-zero.

    Parameters:
        mapper, reducer, combiner: a class, an object, a callable, or a
            string (strings are forwarded as options and not run here).
        buffersize: passed as second argument to the module's ``sorted``
            before combining.
        mapconf, redconf, combconf: callables invoked before processing;
            replaced by the object's ``configure`` attribute when present.
        mapclose, redclose, combclose: callables invoked after processing;
            replaced by the object's ``close`` attribute when present.
        opts: initial options handed to ``Options``.
        input: iterable of input paths replacing any 'input' options.
        output: output path; taken from the 'output' option when None.
        iter: index of the iteration this call represents.

    Diagnostics go through ``.write()`` rather than the print statement;
    the emitted text is the same for byte-string messages.
    """
    if len(sys.argv) > 1 and not sys.argv[1][0] == '-':
        iterarg = 0  # default value
        if len(sys.argv) > 2:
            iterarg = int(sys.argv[2])
        memlim = None  # memory limit
        if len(sys.argv) > 3:
            memlim = int(sys.argv[3])
            resource.setrlimit(resource.RLIMIT_AS, (memlim, memlim))

        # Resolved even when this is not the requested iteration, as the
        # lookups (and the runinfo instantiation) always happened up front.
        mrbase_class = loadclassname(os.environ['dumbo_mrbase_class'])
        jk_class = loadclassname(os.environ['dumbo_jk_class'])
        runinfo = loadclassname(os.environ['dumbo_runinfo_class'])()

        if iterarg != iter:
            return
        if sys.argv[1].startswith('map'):
            _run_map_task(mapper, combiner, buffersize, mapconf, combconf,
                          mapclose, combclose, memlim, mrbase_class,
                          jk_class, runinfo)
        elif reducer:
            _run_reduce_task(reducer, redconf, redclose, mrbase_class,
                             jk_class)
        else:
            # Identity pass-through.  This branch used to iterate over a
            # name that was never assigned and therefore always failed.
            _run_write_pairs('stream_reduce_output',
                             _run_read_pairs('stream_reduce_input'))
    else:
        _run_submit(mapper, reducer, combiner, opts, input, output)
# Copy every typed-bytes (key, value) pair from stdin to stdout unchanged,
# using the ctypedbytes implementation.
import sys

import ctypedbytes as typedbytes

reader = typedbytes.PairedInput(sys.stdin)
writer = typedbytes.PairedOutput(sys.stdout)
# The PairedInput object itself is handed to writes() as the pair source.
writer.writes(reader)