def start(prog, opts, stdout=sys.stdout, stderr=sys.stderr):
    """Run ``prog`` with the current Python interpreter.

    The given options are copied into a fresh ``Options`` object and then
    extended with the 'common' and 'start' sections from ``configopts``.
    If no 'prog' option is present, ``prog`` is added under that name.

    When ``prog`` is not an existing path it is run as a module
    (``-m prog``), unless it ends in ".py": in that case an error is
    written to ``sys.stderr`` and 1 is returned.

    Returns the result of ``execute`` or 1 when the script is missing.
    """
    run_opts = Options(opts)
    for section in ('common', 'start'):
        run_opts += Options(configopts(section))

    libegg = run_opts['libegg']
    shortcuts = dict(configopts('eggs', prog))
    pyenv = envdef('PYTHONPATH', libegg,
                   shortcuts=shortcuts,
                   extrapaths=sys.path)

    if not run_opts['prog']:
        run_opts.add('prog', prog)

    target = prog
    if not os.path.exists(prog):
        if prog.endswith(".py"):
            # A missing *.py path is a hard error; anything else is
            # assumed to be a module name.
            message = ' '.join(('ERROR:', prog, 'does not exist'))
            sys.stderr.write(message + '\n')
            return 1
        target = '-m ' + prog

    command = "%s %s" % (sys.executable, target)
    return execute(command,
                   run_opts,
                   pyenv,
                   stdout=stdout,
                   stderr=stderr,
                   printcmd=False)
class RawReducer(object):
    """Reducer to generate outputs in raw file format"""

    # Class-level default; can be overridden per instance via __init__.
    multipleoutput = False
    # Options used when all output goes to a single destination.
    singleopts = Options([
        ('outputformat', 'raw'),
    ])
    # Options used when the output is split per path.
    multipleopts = Options([
        ('getpath', 'yes'),
        ('outputformat', 'raw'),
        ('partitioner', 'fm.last.feathers.partition.Prefix'),
        ('jobconf', 'feathers.output.filename.strippart=true'),
    ])

    def __init__(self, factory=None, multipleoutput=None):
        """Configure the reducer.

        factory: optional replacement for the ``factory`` method; only used
            when truthy.
        multipleoutput: when not None, overrides the class-level
            ``multipleoutput`` flag for this instance.
        """
        if factory:
            self.factory = factory
        if multipleoutput is not None:
            self.multipleoutput = multipleoutput
        self.opts = self.multipleopts if self.multipleoutput else self.singleopts

    def __call__(self, data):
        """Yield ``(path, chunk)`` pairs produced by per-path processors.

        data: iterable of ``(key, values)`` pairs. With multiple outputs
            each key is itself a ``(path, key)`` pair; otherwise every key
            is wrapped as ``(None, key)`` so both modes share one code path.

        Exactly one processor is created per path. Earlier code also
        created an extra processor before the loop that was never used nor
        closed; it has been removed.
        """
        if not self.multipleoutput:
            data = (((None, key), values) for key, values in data)

        for path, group in groupby(data, lambda item: item[0][0]):
            proc = self.factory()
            for (_, key), values in group:
                for chk in proc(key, values) or ():
                    yield path, chk

            # Processors without a close() fall back to tuple(), which
            # yields nothing.
            close = getattr(proc, 'close', tuple)
            for chk in close() or ():
                yield path, chk

    def factory(self):
        """Processor factory used to consume reducer input (one per path on multiple outputs)

        Must return a callable (aka processor) that accepts two parameters
        "key" and "values", and returns an iterable of strings or None.

        The processor may have a close() method that returns an iterable of
        strings or None. This method is called when the last key-values pair
        for a path is seen.
        """
        return lambda key, values: values
class JoinCombiner(object):
    """Combiner that routes keys to ``primary`` or ``secondary``.

    Each pair emitted by the chosen handler is re-keyed with a copy of the
    incoming key whose ``body`` is replaced by the emitted key.
    """

    opts = Options([("joinkeys", "yes")])

    def __call__(self, key, values):
        """Yield ``(joinkey, value)`` pairs for one incoming key.

        Primary keys are remembered in ``self._key`` and handled by
        ``primary``; other keys go to ``secondary`` unless
        ``secondary_blocked`` says otherwise, in which case nothing is
        yielded.
        """
        if key.isprimary:
            self._key = key.body
            handler = self.primary
        elif self.secondary_blocked(key.body):
            return
        else:
            handler = self.secondary

        for body, value in handler(key.body, values) or ():
            joinkey = copy(key)
            joinkey.body = body
            yield joinkey, value

    def secondary_blocked(self, key_body):
        '''Determines if the secondary method should be blocked or not.'''
        return False

    def primary(self, key, values):
        """Default primary handler: pair every value with ``key``."""
        for item in values:
            yield key, item

    def secondary(self, key, values):
        """Default secondary handler: pair every value with ``key``."""
        for item in values:
            yield key, item
def __call__(self, func):
    """Attach ``self.opt`` to ``func`` and return ``func``.

    If ``func`` has no ``opts`` attribute yet, a new ``Options`` holding
    just this option is created; otherwise the (key, value) pair is added
    to the existing ``func.opts``.
    """
    if not hasattr(func, 'opts'):
        func.opts = Options([self.opt])
        return func
    name, setting = self.opt
    func.opts.add(name, setting)
    return func
def __init__(self, mapper, isprimary=False):
    """Wrap ``mapper`` and build this object's options.

    The options always contain ('joinkeys', 'yes'); when the mapper has an
    ``opts`` attribute, those options are merged in as well.
    """
    self.mapper = mapper
    self.isprimary = isprimary
    joinopts = Options([('joinkeys', 'yes')])
    if hasattr(mapper, 'opts'):
        joinopts += mapper.opts
    self.opts = joinopts
    self.closefunc = None
def testitertwice(self):
    """Run the itertwice example on brian.txt and check the value for 'e'."""
    opts = self.common_opts
    opts += Options([('input', self.exdir + 'brian.txt'),
                     ('output', self.outfile)])
    retval = cmd.start(self.exdir + 'itertwice.py', opts,
                       stdout=self.logfile, stderr=self.logfile)
    self.assertEqual(0, retval)
    # Close the result file explicitly instead of leaking the handle;
    # try/finally keeps this valid on interpreters without `with`.
    result_file = open(self.outfile)
    try:
        output = dict(util.loadcode(result_file))
    finally:
        result_file.close()
    self.assertEqual(14, int(output['e']))
def decodepipe(opts=None):
    """Print the ``dumptext`` rendering of ``loadcode``-parsed input.

    Input comes from the paths stored under the 'file' option (which is
    popped from ``opts``), or from ``sys.stdin`` when there are none. Each
    line is passed on without its last character, every output record is
    printed tab-joined, and each input is closed after it is processed.

    Always returns 0.
    """
    if not opts:
        opts = Options()
    paths = opts.pop('file')
    if paths:
        streams = [open(path) for path in paths]
    else:
        streams = [sys.stdin]
    for stream in streams:
        decoded = loadcode(line[:-1] for line in stream)
        for fields in dumptext(decoded):
            print('\t'.join(fields))
        stream.close()
    return 0
def testoowordcount(self):
    """Run the oowordcount example on brian.txt and check the 'Brian' count."""
    opts = self.common_opts
    opts += Options([('excludes', self.exdir + 'excludes.txt'),
                     ('input', self.exdir + 'brian.txt'),
                     ('output', self.outfile)])
    retval = cmd.start(self.exdir + 'oowordcount.py', opts,
                       stdout=self.logfile, stderr=self.logfile)
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(0, retval)
    # Close the result file explicitly instead of leaking the handle;
    # try/finally keeps this valid on interpreters without `with`.
    result_file = open(self.outfile)
    try:
        output = dict(util.loadcode(result_file))
    finally:
        result_file.close()
    self.assertEqual(6, int(output['Brian']))
def testjoin(self):
    """Run the join example on hostnames.txt and logs.txt; check 'node1'."""
    opts = self.common_opts
    opts += Options([('input', self.exdir + 'hostnames.txt'),
                     ('input', self.exdir + 'logs.txt'),
                     ('output', self.outfile)])
    retval = cmd.start(self.exdir + 'join.py', opts,
                       stdout=self.logfile, stderr=self.logfile)
    self.assertEqual(0, retval)
    # Close the result file explicitly instead of leaking the handle;
    # try/finally keeps this valid on interpreters without `with`.
    result_file = open(self.outfile)
    try:
        output = dict(util.loadcode(result_file))
    finally:
        result_file.close()
    self.assertEqual(5, int(output['node1']))
def testmulticount(self):
    """Run the multicount example on brian.txt and eno.txt; check both counts."""
    opts = self.common_opts
    opts += Options([('input', self.exdir + 'brian.txt'),
                     ('input', self.exdir + 'eno.txt'),
                     ('output', self.outfile)])
    retval = cmd.start(self.exdir + 'multicount.py', opts,
                       stdout=self.logfile, stderr=self.logfile)
    self.assertEqual(0, retval)
    # Close the result file explicitly instead of leaking the handle;
    # try/finally keeps this valid on interpreters without `with`.
    result_file = open(self.outfile)
    try:
        output = dict(util.loadcode(result_file))
    finally:
        result_file.close()
    self.assertEqual(6, int(output[('A', 'Brian')]))
    self.assertEqual(6, int(output[('B', 'Eno')]))
def setUp(self):
    """Locate the examples/tests directories and prepare per-test fixtures.

    Directory lookup order: the "directory" environment variable, then the
    path of this file (when it contains a "/"), then relative defaults.
    Also opens log.txt for writing, sets the output file path and creates
    the shared ('checkoutput', 'no') options.
    """
    if "directory" in os.environ:
        rootdir = os.environ["directory"]
        examples, tests = rootdir + "/examples/", rootdir + "/tests/"
    elif "/" in __file__:
        examples = __file__.split("tests/")[0] + "examples/"
        # Everything up to and including the last "/" of this file's path.
        tests = "/".join(__file__.split("/")[:-1]) + "/"
    else:
        examples, tests = "../examples/", "./"
    self.exdir = examples
    self.tstdir = tests
    self.logfile = open(self.tstdir + "log.txt", "w")
    self.outfile = self.tstdir + "output.code"
    self.common_opts = Options([('checkoutput', 'no')])
def encodepipe(opts=None):
    """Print the ``dumpcode`` rendering of parsed input.

    The 'addpath', 'file' and 'alreadycoded' options are read and then
    removed from ``opts``. Input comes from the 'file' paths, or from
    ``sys.stdin`` when there are none. Lines are parsed with ``loadcode``
    when 'alreadycoded' is set and with ``loadtext`` otherwise. When
    'addpath' is set, each key becomes ``(file name, key)``.

    Always returns 0.
    """
    if not opts:
        opts = Options()
    keys = ['addpath', 'file', 'alreadycoded']
    addedopts = opts.filter(keys)
    opts.remove(*keys)

    paths = addedopts['file']
    if paths:
        streams = [open(path) for path in paths]
    else:
        streams = [sys.stdin]
    if addedopts['alreadycoded']:
        loadfun = loadcode
    else:
        loadfun = loadtext
    addpath = addedopts['addpath']

    for stream in streams:
        records = loadfun(line[:-1] for line in stream)
        if addpath:
            records = (((stream.name, key), value)
                       for (key, value) in records)
        for fields in dumpcode(records):
            print('\t'.join(fields))
        stream.close()
    return 0
def __init__(self):
    """Start with an empty mapper list and the ("addpath", "iter") option."""
    self.opts = Options([("addpath", "iter")])
    self.mappers = []
def __init__(self, prog, opts):
    """Initialise the iteration and merge in the streaming config sections.

    After the base ``Iteration`` set-up, the 'streaming' section is added,
    followed by the section named 'streaming_<value>' where <value> is the
    first entry of the 'hadoop' option.
    """
    Iteration.__init__(self, prog, opts)
    generic = Options(configopts('streaming', prog, self.opts))
    self.opts += generic
    # The second section name depends on options merged in just above.
    section = 'streaming_%s' % self.opts['hadoop'][0]
    specific = Options(configopts(section, prog, self.opts))
    self.opts += specific
def cat(self, path, opts):
    """Run ``decodepipe`` on ``path`` and return its result.

    A copy of ``opts`` is made and ``path`` is added to it under 'file'.
    """
    pipe_opts = Options(opts)
    pipe_opts.add('file', path)
    return decodepipe(pipe_opts)
def ls(path, opts):
    """Delegate to ``ls`` on the object returned by ``create_filesystem``.

    ``opts`` is extended with the 'common' and 'ls' config sections before
    being passed to both ``create_filesystem`` and the ``ls`` call.
    Returns whatever that ``ls`` call returns.
    """
    # Work on a copy, as put() does, so the caller's options object is not
    # extended by the += below.
    opts = Options(opts)
    opts += Options(configopts('common'))
    opts += Options(configopts('ls'))
    return create_filesystem(opts).ls(path, opts)
def cat(path, opts):
    """Delegate to ``cat`` on the object returned by ``create_filesystem``.

    ``opts`` is extended with the 'common' and 'cat' config sections before
    being passed to both ``create_filesystem`` and the ``cat`` call.
    Returns whatever that ``cat`` call returns.
    """
    # Work on a copy, as put() does, so the caller's options object is not
    # extended by the += below.
    opts = Options(opts)
    opts += Options(configopts('common'))
    opts += Options(configopts('cat'))
    return create_filesystem(opts).cat(path, opts)
def __init__(self, prog, opts):
    """Initialise the iteration and merge in the 'unix' config section."""
    Iteration.__init__(self, prog, opts)
    unix_opts = Options(configopts('unix', prog, self.opts))
    self.opts += unix_opts
def get(path1, path2, opts):
    """Delegate to ``get`` on the object returned by ``create_filesystem``.

    ``opts`` is extended with the 'common' and 'get' config sections before
    being passed to both ``create_filesystem`` and the ``get`` call.
    Returns whatever that ``get`` call returns.
    """
    # Work on a copy, as put() does, so the caller's options object is not
    # extended by the += below.
    opts = Options(opts)
    opts += Options(configopts('common'))
    opts += Options(configopts('get'))
    return create_filesystem(opts).get(path1, path2, opts)
def rm(path, opts):
    """Delegate to ``rm`` on the object returned by ``create_filesystem``.

    ``opts`` is extended with the 'common' and 'rm' config sections before
    being passed to both ``create_filesystem`` and the ``rm`` call.
    Returns whatever that ``rm`` call returns.
    """
    # Work on a copy, as put() does, so the caller's options object is not
    # extended by the += below.
    opts = Options(opts)
    opts += Options(configopts('common'))
    opts += Options(configopts('rm'))
    return create_filesystem(opts).rm(path, opts)
def put(path1, path2, opts):
    """Delegate to ``put`` on the object returned by ``create_filesystem``.

    A copy of ``opts`` is extended with the 'common' and 'put' config
    sections and passed to both ``create_filesystem`` and the ``put`` call.
    Returns whatever that ``put`` call returns.
    """
    merged = Options(opts)
    for section in ('common', 'put'):
        merged += Options(configopts(section))
    filesystem = create_filesystem(merged)
    return filesystem.put(path1, path2, merged)
def exists(path, opts):
    """Delegate to ``exists`` on the object returned by ``create_filesystem``.

    ``opts`` is extended with the 'common' and 'exists' config sections
    before being passed to both ``create_filesystem`` and the ``exists``
    call. Returns whatever that ``exists`` call returns.
    """
    # Work on a copy, as put() does, so the caller's options object is not
    # extended by the += below.
    opts = Options(opts)
    opts += Options(configopts('common'))
    opts += Options(configopts('exists'))
    return create_filesystem(opts).exists(path, opts)
def test_Options(self):
    """Exercise the public surface of ``Options``.

    Covers add/get, item access and deletion, in-place addition of
    Options/list/set, iteration, len, truthiness, filter, to_dict, remove
    and pop. Uses assertEqual rather than the deprecated assertEquals
    alias.
    """
    o = Options([('param', 'p1')])

    # test add / get
    o.add('param', 'p2')
    # test repeat add same parameter
    o.add('param', 'p2')
    o.add('input', '/dev/path')
    o.add('output', '/dev/out')
    self.assertEqual(set(o.get('param')), set(['p1', 'p2']))
    self.assertEqual(o.get('input'), ['/dev/path'])
    self.assertEqual(o.get('notexist'), [])

    # test __getitem__
    self.assertEqual(set(o['param']), set(['p1', 'p2']))
    self.assertEqual(o['input'], ['/dev/path'])
    self.assertEqual(o['notexist'], [])

    # test __delitem__
    self.assertEqual(o['output'], ['/dev/out'])
    del o['output']
    self.assertEqual(o['output'], [])

    # test __iadd__
    # adding Options objects
    o += Options([('output', '/dev/out2'), ('jar', 'my.jar')])
    self.assertEqual(o['output'], ['/dev/out2'])
    self.assertEqual(o['jar'], ['my.jar'])
    # adding a list & set
    o += [('param', 'p3'), ('egg', 'lib.egg')]
    self.assertEqual(set(o['param']), set(['p1', 'p2', 'p3']))
    self.assertEqual(o['egg'], ['lib.egg'])
    o += set([('cmdenv', 'p=2')])
    self.assertEqual(o['cmdenv'], ['p=2'])

    # testing iter / allopts
    o2 = Options([('param', 'p1')])
    o2.add('param', 'p2')
    o2.add('input', '/dev/path')
    self.assertEqual(set(o2), set([('param', 'p1'), ('param', 'p2'),
                                   ('input', '/dev/path')]))
    self.assertEqual(set(o2.allopts()), set([('param', 'p1'),
                                             ('param', 'p2'),
                                             ('input', '/dev/path')]))

    # testing len
    self.assertEqual(len(o), 8)
    self.assertEqual(len(o2), 3)
    self.assertEqual(len(Options()), 0)

    # testing boolean
    self.assertTrue(o)
    self.assertTrue(o2)
    self.assertFalse(Options())

    # testing filter
    self.assertEqual(set(o2.filter(['param'])['param']), set(['p1', 'p2']))
    self.assertEqual(o2.filter(['input'])['input'], ['/dev/path'])
    nop = o.filter(['param', 'jar', 'egg'])
    self.assertEqual(len(nop), 5)
    self.assertEqual(set(nop['param']), set(['p1', 'p2', 'p3']))
    self.assertEqual(nop['jar'], ['my.jar'])
    self.assertEqual(nop['egg'], ['lib.egg'])

    # testing to_dict
    expected = {
        'param': ['p1', 'p2', 'p3'],
        'egg': ['lib.egg'],
        'jar': ['my.jar']
    }
    self.assertEqual(nop.to_dict(), expected)

    # testing remove
    nop.remove('param', 'jar')
    self.assertEqual(len(nop), 1)
    self.assertEqual(nop['param'], [])
    self.assertEqual(nop['jar'], [])
    self.assertEqual(nop['egg'], ['lib.egg'])

    # testing pop
    self.assertEqual(nop.pop('egg'), ['lib.egg'])
    self.assertEqual(len(nop), 0)
    self.assertEqual(nop['egg'], [])
    self.assertEqual(nop.pop('notexist'), [])