Example #1
0
def start(prog,
          opts,
          stdout=sys.stdout,
          stderr=sys.stderr):
    """Launch ``prog`` with the current Python interpreter.

    ``opts`` is wrapped in an ``Options`` object and extended with the
    'common' and 'start' configuration sections. A PYTHONPATH environment
    definition is built from the 'libegg' option, the 'eggs' configuration
    for ``prog`` and the current ``sys.path``.

    If ``prog`` is not an existing path it is run as a module (``-m``),
    unless it ends in ``.py``, in which case an error is reported and 1 is
    returned.

    Returns 1 on that error, otherwise whatever ``execute`` returns.
    """
    opts = Options(opts)
    opts += Options(configopts('common'))
    opts += Options(configopts('start'))

    pyenv = envdef('PYTHONPATH', opts['libegg'],
                   shortcuts=dict(configopts('eggs', prog)),
                   extrapaths=sys.path)

    if not opts['prog']:
        opts.add('prog', prog)

    if not os.path.exists(prog):
        if prog.endswith(".py"):
            # Report on the stream the caller asked for, not the global one.
            # Fall back to sys.stderr when ``stderr`` is not a writable
            # file-like object (e.g. a raw descriptor handed to execute()).
            errstream = stderr if hasattr(stderr, 'write') else sys.stderr
            errstream.write('ERROR: %s does not exist\n' % prog)
            return 1
        prog = '-m ' + prog

    return execute("%s %s" % (sys.executable, prog),
                   opts,
                   pyenv,
                   stdout=stdout,
                   stderr=stderr,
                   printcmd=False)
Example #2
0
class RawReducer(object):
    """Reducer to generate outputs in raw file format"""

    # Class-level default; may be overridden per instance in __init__.
    multipleoutput = False
    # Options used when all output goes to a single destination.
    singleopts = Options([
        ('outputformat', 'raw'),
    ])
    # Options used when every record carries its own output path.
    multipleopts = Options([
        ('getpath', 'yes'),
        ('outputformat', 'raw'),
        ('partitioner', 'fm.last.feathers.partition.Prefix'),
        ('jobconf', 'feathers.output.filename.strippart=true'),
    ])

    def __init__(self, factory=None, multipleoutput=None):
        """Configure the reducer.

        factory -- optional replacement for the ``factory`` method.
        multipleoutput -- when not None, overrides the class-level flag.
        """
        if factory:
            self.factory = factory
        if multipleoutput is not None:
            self.multipleoutput = multipleoutput
        self.opts = self.multipleopts if self.multipleoutput else self.singleopts

    def __call__(self, data):
        """Run the reducer input through processors, yielding (path, chunk).

        ``data`` yields ``(key, values)`` pairs, or ``((path, key), values)``
        pairs when ``multipleoutput`` is set. In single-output mode every
        record is given the path ``None``.

        Consecutive records sharing a path form one group. Each group gets
        its own processor from ``factory()``; once the group is exhausted the
        processor's optional ``close()`` is called and its chunks are yielded
        as well.
        """
        if not self.multipleoutput:
            data = (((None, key), values) for key, values in data)

        # One processor per path group. No processor is created up front:
        # it would never receive data and would never be closed.
        for path, group in groupby(data, lambda x: x[0][0]):
            proc = self.factory()
            for (_, key), values in group:
                for chk in proc(key, values) or ():
                    yield path, chk

            # ``tuple`` is the fallback so a processor without close()
            # simply contributes no extra chunks.
            close = getattr(proc, 'close', tuple)
            for chk in close() or ():
                yield path, chk

    def factory(self):
        """Processor factory used to consume reducer input (one per path on multiple outputs)

        Must return a callable (aka processor) that accepts two parameters
        "key" and "values", and returns an iterable of strings or None.

        The processor may have a close() method that returns an iterable of
        strings or None. This method is called when the last key-values pair
        for a path is seen.

        """
        return lambda key, values: values
Example #3
0
class JoinCombiner(object):
    """Combiner that routes records to ``primary`` or ``secondary`` handlers.

    Records whose key has ``isprimary`` set go to ``primary``; all others go
    to ``secondary`` unless ``secondary_blocked`` says otherwise. Every pair
    a handler produces is re-emitted under a copy of the incoming key whose
    ``body`` is replaced by the produced key.
    """

    opts = Options([("joinkeys", "yes")])

    def __call__(self, key, values):
        """Yield (key copy, value) pairs for one incoming key."""
        body = key.body
        if key.isprimary:
            # Remember the body of the most recent primary key.
            self._key = body
            handler = self.primary
        elif self.secondary_blocked(body):
            return
        else:
            handler = self.secondary

        for out_body, out_value in handler(body, values) or ():
            out_key = copy(key)
            out_key.body = out_body
            yield out_key, out_value

    def secondary_blocked(self, key_body):
        '''Determines if the secondary method should be blocked or not.'''
        return False

    def primary(self, key, values):
        """Default primary handler: pass every value through unchanged."""
        for item in values:
            yield key, item

    def secondary(self, key, values):
        """Default secondary handler: pass every value through unchanged."""
        for item in values:
            yield key, item
Example #4
0
 def __call__(self, func):
     """Attach ``self.opt`` to ``func`` and return ``func``.

     When ``func`` has no ``opts`` attribute, one is created holding only
     ``self.opt``; otherwise the (key, value) pair is added to the
     existing ``func.opts``.
     """
     if not hasattr(func, 'opts'):
         func.opts = Options([self.opt])
         return func
     name, setting = self.opt
     func.opts.add(name, setting)
     return func
Example #5
0
 def __init__(self, mapper, isprimary=False):
     """Wrap ``mapper`` and collect its options.

     The wrapper always carries ``joinkeys=yes``; any ``opts`` found on
     ``mapper`` are merged on top of that.
     """
     combined = Options([('joinkeys', 'yes')])
     if hasattr(mapper, 'opts'):
         combined += mapper.opts
     self.mapper = mapper
     self.isprimary = isprimary
     self.opts = combined
     self.closefunc = None
Example #6
0
 def testitertwice(self):
     """Run the itertwice example on brian.txt and check the count for 'e'."""
     opts = self.common_opts
     opts += Options([('input', self.exdir + 'brian.txt'),
                      ('output', self.outfile)])
     retval = cmd.start(self.exdir + 'itertwice.py',
                        opts,
                        stdout=self.logfile,
                        stderr=self.logfile)
     self.assertEqual(0, retval)
     # Close the result file explicitly rather than leaking the handle.
     outfile = open(self.outfile)
     try:
         output = dict(util.loadcode(outfile))
     finally:
         outfile.close()
     self.assertEqual(14, int(output['e']))
Example #7
0
File: cmd.py Project: andrix/dumbo
def decodepipe(opts=None):
    opts = opts or Options()
    ofiles = opts.pop('file')
    files = map(open, ofiles) if ofiles else [sys.stdin]

    for _file in files:
        outputs = loadcode(line[:-1] for line in _file)
        for output in dumptext(outputs):
            print '\t'.join(output)
        _file.close()
        return 0
Example #8
0
 def testoowordcount(self):
     """Run the oowordcount example with an excludes file and check 'Brian'."""
     opts = self.common_opts
     opts += Options([('excludes', self.exdir + 'excludes.txt'),
                      ('input', self.exdir + 'brian.txt'),
                      ('output', self.outfile)])
     retval = cmd.start(self.exdir + 'oowordcount.py',
                        opts,
                        stdout=self.logfile,
                        stderr=self.logfile)
     # assertEqual instead of the deprecated assertEquals alias.
     self.assertEqual(0, retval)
     # Close the result file explicitly rather than leaking the handle.
     outfile = open(self.outfile)
     try:
         output = dict(util.loadcode(outfile))
     finally:
         outfile.close()
     self.assertEqual(6, int(output['Brian']))
Example #9
0
 def testjoin(self):
     """Run the join example on hostnames.txt and logs.txt; check 'node1'."""
     opts = self.common_opts
     opts += Options([('input', self.exdir + 'hostnames.txt'),
                      ('input', self.exdir + 'logs.txt'),
                      ('output', self.outfile)])
     retval = cmd.start(self.exdir + 'join.py',
                        opts,
                        stdout=self.logfile,
                        stderr=self.logfile)
     self.assertEqual(0, retval)
     # Close the result file explicitly rather than leaking the handle.
     outfile = open(self.outfile)
     try:
         output = dict(util.loadcode(outfile))
     finally:
         outfile.close()
     self.assertEqual(5, int(output['node1']))
Example #10
0
def start(prog, opts, stdout=sys.stdout, stderr=sys.stderr):
    """Launch ``prog`` with the current Python interpreter.

    ``opts`` is wrapped in an ``Options`` object and extended with the
    'common' and 'start' configuration sections. A PYTHONPATH environment
    definition is built from the 'libegg' option, the 'eggs' configuration
    for ``prog`` and the current ``sys.path``.

    If ``prog`` is not an existing path it is run as a module (``-m``),
    unless it ends in ``.py``, in which case an error is reported and 1 is
    returned.

    Returns 1 on that error, otherwise whatever ``execute`` returns.
    """

    opts = Options(opts)
    opts += Options(configopts('common'))
    opts += Options(configopts('start'))

    pyenv = envdef('PYTHONPATH',
                   opts['libegg'],
                   shortcuts=dict(configopts('eggs', prog)),
                   extrapaths=sys.path)

    if not opts['prog']:
        opts.add('prog', prog)

    if not os.path.exists(prog):
        if prog.endswith(".py"):
            # Report on the stream the caller asked for, not the global one.
            # Fall back to sys.stderr when ``stderr`` is not a writable
            # file-like object (e.g. a raw descriptor handed to execute()).
            errstream = stderr if hasattr(stderr, 'write') else sys.stderr
            errstream.write('ERROR: %s does not exist\n' % prog)
            return 1
        prog = '-m ' + prog

    return execute("%s %s" % (sys.executable, prog),
                   opts,
                   pyenv,
                   stdout=stdout,
                   stderr=stderr,
                   printcmd=False)
Example #11
0
 def testmulticount(self):
     """Run the multicount example on two inputs and check both counts."""
     opts = self.common_opts
     opts += Options([('input', self.exdir + 'brian.txt'),
                      ('input', self.exdir + 'eno.txt'),
                      ('output', self.outfile)])
     retval = cmd.start(self.exdir + 'multicount.py',
                        opts,
                        stdout=self.logfile,
                        stderr=self.logfile)
     self.assertEqual(0, retval)
     # Close the result file explicitly rather than leaking the handle.
     outfile = open(self.outfile)
     try:
         output = dict(util.loadcode(outfile))
     finally:
         outfile.close()
     self.assertEqual(6, int(output[('A', 'Brian')]))
     self.assertEqual(6, int(output[('B', 'Eno')]))
Example #12
0
 def setUp(self):
     """Locate the examples/tests directories and prepare per-test files.

     Lookup order: the ``directory`` environment variable, then the
     location of this file, then paths relative to the working directory.
     Also opens ``log.txt`` for writing and sets the default options.
     """
     env_root = os.environ.get("directory")
     if env_root is not None:
         examples, tests = env_root + "/examples/", env_root + "/tests/"
     elif "/" not in __file__:
         examples, tests = "../examples/", "./"
     else:
         examples = __file__.split("tests/")[0] + "examples/"
         # Everything before the last slash is this file's directory.
         tests = __file__.rsplit("/", 1)[0] + "/"
     self.exdir = examples
     self.tstdir = tests
     self.logfile = open(tests + "log.txt", "w")
     self.outfile = tests + "output.code"
     self.common_opts = Options([('checkoutput', 'no')])
Example #13
0
File: cmd.py Project: andrix/dumbo
def encodepipe(opts=None):
    opts = opts or Options()
    keys = ['addpath', 'file', 'alreadycoded']
    addedopts = opts.filter(keys)
    opts.remove(*keys)

    ofiles = addedopts['file']
    files = map(open, ofiles) if ofiles else [sys.stdin]

    loadfun = loadcode if addedopts['alreadycoded'] else loadtext
    addpath = addedopts['addpath']

    for _file in files:
        outputs = loadfun(line[:-1] for line in _file)
        if addpath:
            outputs = (((_file.name, key), value) for (key, value) in outputs)
        for output in dumpcode(outputs):
            print '\t'.join(output)
        _file.close()
    return 0
Example #14
0
 def __init__(self):
     """Start with no mappers and the ``addpath`` option set to ``iter``."""
     self.opts = Options([("addpath", "iter")])
     self.mappers = list()
Example #15
0
 def __init__(self, prog, opts):
     """Initialise the iteration and merge in streaming configuration.

     Adds the 'streaming' configuration section first, then the section
     named after the first value of the 'hadoop' option.
     """
     Iteration.__init__(self, prog, opts)
     generic_conf = configopts('streaming', prog, self.opts)
     self.opts += Options(generic_conf)
     hadoop_name = self.opts['hadoop'][0]
     section = 'streaming_%s' % hadoop_name
     specific_conf = configopts(section, prog, self.opts)
     self.opts += Options(specific_conf)
Example #16
0
File: unix.py Project: zmtmei/dumbo
 def cat(self, path, opts):
     """Decode ``path`` to standard output via ``decodepipe``.

     ``opts`` is wrapped in a new ``Options`` object, ``path`` is added to
     it as a 'file' entry, and the result of ``decodepipe`` is returned.
     """
     pipe_opts = Options(opts)
     pipe_opts.add('file', path)
     return decodepipe(pipe_opts)
Example #17
0
File: cmd.py Project: andrix/dumbo
def ls(path, opts):
    """List ``path`` on the filesystem selected by the merged options.

    ``opts`` is first wrapped in a fresh ``Options`` object so that the
    'common' and 'ls' configuration entries merged in here are not written
    into the caller's object.

    Returns whatever the filesystem's ``ls`` returns.
    """
    opts = Options(opts)
    opts += Options(configopts('common'))
    opts += Options(configopts('ls'))
    return create_filesystem(opts).ls(path, opts)
Example #18
0
File: cmd.py Project: andrix/dumbo
def cat(path, opts):
    """Output ``path`` from the filesystem selected by the merged options.

    ``opts`` is first wrapped in a fresh ``Options`` object so that the
    'common' and 'cat' configuration entries merged in here are not written
    into the caller's object.

    Returns whatever the filesystem's ``cat`` returns.
    """
    opts = Options(opts)
    opts += Options(configopts('common'))
    opts += Options(configopts('cat'))
    return create_filesystem(opts).cat(path, opts)
Example #19
0
File: unix.py Project: zmtmei/dumbo
 def __init__(self, prog, opts):
     """Initialise the iteration and merge in the 'unix' configuration."""
     Iteration.__init__(self, prog, opts)
     unix_conf = configopts('unix', prog, self.opts)
     self.opts += Options(unix_conf)
Example #20
0
File: cmd.py Project: andrix/dumbo
def get(path1, path2, opts):
    """Call ``get(path1, path2)`` on the filesystem selected by the options.

    ``opts`` is first wrapped in a fresh ``Options`` object so that the
    'common' and 'get' configuration entries merged in here are not written
    into the caller's object.

    Returns whatever the filesystem's ``get`` returns.
    """
    opts = Options(opts)
    opts += Options(configopts('common'))
    opts += Options(configopts('get'))
    return create_filesystem(opts).get(path1, path2, opts)
Example #21
0
File: cmd.py Project: andrix/dumbo
def rm(path, opts):
    """Remove ``path`` on the filesystem selected by the merged options.

    ``opts`` is first wrapped in a fresh ``Options`` object so that the
    'common' and 'rm' configuration entries merged in here are not written
    into the caller's object.

    Returns whatever the filesystem's ``rm`` returns.
    """
    opts = Options(opts)
    opts += Options(configopts('common'))
    opts += Options(configopts('rm'))
    return create_filesystem(opts).rm(path, opts)
Example #22
0
File: unix.py Project: jso/dumbo
 def cat(self, path, opts):
     """Decode ``path`` to standard output via ``decodepipe``.

     ``opts`` is wrapped in a new ``Options`` object, ``path`` is added to
     it as a 'file' entry, and the result of ``decodepipe`` is returned.
     """
     pipe_opts = Options(opts)
     pipe_opts.add('file', path)
     return decodepipe(pipe_opts)
Example #23
0
def put(path1, path2, opts):
    """Call ``put(path1, path2)`` on the filesystem selected by the options.

    Works on a new ``Options`` object built from ``opts`` and extended with
    the 'common' and 'put' configuration sections, in that order.

    Returns whatever the filesystem's ``put`` returns.
    """
    merged = Options(opts)
    for section in ('common', 'put'):
        merged += Options(configopts(section))
    filesystem = create_filesystem(merged)
    return filesystem.put(path1, path2, merged)
Example #24
0
File: cmd.py Project: andrix/dumbo
def exists(path, opts):
    """Check ``path`` on the filesystem selected by the merged options.

    ``opts`` is first wrapped in a fresh ``Options`` object so that the
    'common' and 'exists' configuration entries merged in here are not
    written into the caller's object.

    Returns whatever the filesystem's ``exists`` returns.
    """
    opts = Options(opts)
    opts += Options(configopts('common'))
    opts += Options(configopts('exists'))
    return create_filesystem(opts).exists(path, opts)
Example #25
0
    def test_Options(self):
        """Exercise the Options container API end to end.

        Covers add/get, item access and deletion, in-place addition of
        Options/list/set, iteration, len, truthiness, filter, to_dict,
        remove and pop. Uses assertEqual rather than the deprecated
        assertEquals alias.
        """
        o = Options([('param', 'p1')])
        # test add / get
        o.add('param', 'p2')

        # test repeat add same parameter
        o.add('param', 'p2')
        o.add('input', '/dev/path')
        o.add('output', '/dev/out')
        self.assertEqual(set(o.get('param')), set(['p1', 'p2']))
        self.assertEqual(o.get('input'), ['/dev/path'])
        self.assertEqual(o.get('notexist'), [])

        # test __getitem__
        self.assertEqual(set(o['param']), set(['p1', 'p2']))
        self.assertEqual(o['input'], ['/dev/path'])
        self.assertEqual(o['notexist'], [])

        # test __delitem__
        self.assertEqual(o['output'], ['/dev/out'])
        del o['output']
        self.assertEqual(o['output'], [])

        # test __iadd__
        # adding Options objects
        o += Options([('output', '/dev/out2'), ('jar', 'my.jar')])
        self.assertEqual(o['output'], ['/dev/out2'])
        self.assertEqual(o['jar'], ['my.jar'])
        # adding a list & set
        o += [('param', 'p3'), ('egg', 'lib.egg')]
        self.assertEqual(set(o['param']), set(['p1', 'p2', 'p3']))
        self.assertEqual(o['egg'], ['lib.egg'])

        o += set([('cmdenv', 'p=2')])
        self.assertEqual(o['cmdenv'], ['p=2'])

        # testing iter / allopts
        o2 = Options([('param', 'p1')])
        o2.add('param', 'p2')
        o2.add('input', '/dev/path')
        expected_pairs = set([('param', 'p1'),
                              ('param', 'p2'),
                              ('input', '/dev/path')])
        self.assertEqual(set(o2), expected_pairs)
        self.assertEqual(set(o2.allopts()), expected_pairs)

        # testing len
        self.assertEqual(len(o), 8)
        self.assertEqual(len(o2), 3)
        self.assertEqual(len(Options()), 0)

        # testing boolean
        self.assertTrue(o)
        self.assertTrue(o2)
        self.assertFalse(Options())

        # testing filter
        self.assertEqual(set(o2.filter(['param'])['param']), set(['p1', 'p2']))
        self.assertEqual(o2.filter(['input'])['input'], ['/dev/path'])

        nop = o.filter(['param', 'jar', 'egg'])
        self.assertEqual(len(nop), 5)
        self.assertEqual(set(nop['param']), set(['p1', 'p2', 'p3']))
        self.assertEqual(nop['jar'], ['my.jar'])
        self.assertEqual(nop['egg'], ['lib.egg'])

        # testing to_dict
        expected = {
            'param': ['p1', 'p2', 'p3'],
            'egg': ['lib.egg'],
            'jar': ['my.jar']
        }
        self.assertEqual(nop.to_dict(), expected)

        # testing remove
        nop.remove('param', 'jar')
        self.assertEqual(len(nop), 1)
        self.assertEqual(nop['param'], [])
        self.assertEqual(nop['jar'], [])
        self.assertEqual(nop['egg'], ['lib.egg'])

        # testing pop
        self.assertEqual(nop.pop('egg'), ['lib.egg'])
        self.assertEqual(len(nop), 0)
        self.assertEqual(nop['egg'], [])

        self.assertEqual(nop.pop('notexist'), [])