Example #1
 def cat(self, path, opts):
     addedopts = getopts(opts, ['libjar'], delete=False)
     streamingjar = findjar(self.hadoop, 'streaming')
     if not streamingjar:
         print >> sys.stderr, 'ERROR: Streaming jar not found'
         return 1
     hadenv = envdef('HADOOP_CLASSPATH', addedopts['libjar'],
                     shortcuts=dict(configopts('jars')))
     try:
         import typedbytes
         ls = os.popen('%s %s/bin/hadoop dfs -ls %s' % (hadenv, self.hadoop, path))
         if sum(c in path for c in ("*", "?", "{")) > 0:
             # cat each file separately when the path contains special chars
             lineparts = (line.split()[-1] for line in ls)
             subpaths = [part for part in lineparts if part.startswith("/")]
         else:
             # we still do the ls even in this case to make sure we print errors 
             subpaths = [path]
         ls.close()
         for subpath in subpaths:
             dumptb = os.popen('%s %s/bin/hadoop jar %s dumptb %s 2> /dev/null'
                               % (hadenv, self.hadoop, streamingjar, subpath))
             ascodeopt = getopt(opts, 'ascode')
             if ascodeopt and ascodeopt[0] == 'yes':
                 outputs = dumpcode(typedbytes.PairedInput(dumptb))
             else:
                 outputs = dumptext(typedbytes.PairedInput(dumptb))
             for output in outputs:
                 print '\t'.join(output)
             dumptb.close()
     except IOError:
         pass  # ignore
     return 0
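The cat() method above expands glob-style paths by parsing the output of `hadoop dfs -ls` and keeping the last whitespace-separated field of each line when it looks like an absolute path. A minimal, self-contained sketch of that expansion step (assuming the usual -ls output format; `ls_lines` is a hypothetical stand-in for the popen stream):

    def expand_subpaths(path, ls_lines):
        # Keep the last whitespace-separated field of every 'hadoop dfs -ls'
        # line that looks like an absolute path; plain paths pass through.
        if any(c in path for c in ("*", "?", "{")):
            lineparts = (line.split()[-1] for line in ls_lines if line.strip())
            return [part for part in lineparts if part.startswith("/")]
        return [path]

    # expand_subpaths("/data/part-*", ["Found 1 items",
    #                                  "-rw-r--r-- 1 user grp 12 /data/part-00000"])
    # -> ['/data/part-00000']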
Example #2
 def cat(self, path, opts):
     addedopts = getopts(opts, ['libjar'], delete=False)
     streamingjar = findjar(self.hadoop, 'streaming')
     if not streamingjar:
         print >> sys.stderr, 'ERROR: Streaming jar not found'
         return 1
     hadenv = envdef('HADOOP_CLASSPATH', addedopts['libjar'],
                     shortcuts=dict(configopts('jars')))
     try:
         import typedbytes
         ls = os.popen('%s %s dfs -ls %s' % (hadenv, self.hdfs, path))
         if sum(c in path for c in ("*", "?", "{")) > 0:
             # cat each file separately when the path contains special chars
             lineparts = (line.split()[-1] for line in ls)
             subpaths = [part for part in lineparts if part.startswith("/")]
         else:
             # we still do the ls even in this case to make sure we print errors 
             subpaths = [path]
         ls.close()
         for subpath in subpaths:
             if subpath.endswith("/_logs"):
                 continue
             dumptb = os.popen('%s %s/bin/hadoop jar %s dumptb %s 2> /dev/null'
                               % (hadenv, self.hadoop, streamingjar, subpath))
             ascodeopt = getopt(opts, 'ascode')
             if ascodeopt and ascodeopt[0] == 'yes':
                 outputs = dumpcode(typedbytes.PairedInput(dumptb))
             else:
                 outputs = dumptext(typedbytes.PairedInput(dumptb))
             for output in outputs:
                 print '\t'.join(output)
             dumptb.close()
     except IOError:
         pass  # ignore
     return 0
Example #3
    def run(self):
        retval = Iteration.run(self)
        if retval != 0:
            return retval
        addedopts = getopts(self.opts, [
            'input', 'output', 'mapper', 'reducer', 'libegg', 'delinputs',
            'cmdenv', 'pv', 'addpath', 'inputformat', 'outputformat',
            'numreducetasks', 'python', 'pypath', 'sorttmpdir', 'sortbufsize'
        ])
        (mapper, reducer) = (addedopts['mapper'][0], addedopts['reducer'][0])
        if not addedopts['input'] or not addedopts['output']:
            print >> sys.stderr, 'ERROR: input or output not specified'
            return 1
        inputs = reduce(operator.concat,
                        (input.split(' ') for input in addedopts['input']))
        output = addedopts['output'][0]
        pyenv = envdef('PYTHONPATH',
                       addedopts['libegg'],
                       shortcuts=dict(configopts('eggs', self.prog)),
                       extrapaths=addedopts['pypath'])
        cmdenv = ' '.join("%s='%s'" % tuple(arg.split('='))
                          for arg in addedopts['cmdenv'])
        if addedopts['pv'] and addedopts['pv'][0] == 'yes':
            mpv = '| pv -s `du -b %s | cut -f 1` -cN map ' % ' '.join(inputs)
            (spv, rpv) = ('| pv -cN sort ', '| pv -cN reduce ')
        else:
            (mpv, spv, rpv) = ('', '', '')

        (sorttmpdir, sortbufsize) = ('', '')
        if addedopts['sorttmpdir']:
            sorttmpdir = "-T %s" % addedopts['sorttmpdir'][0]
        if addedopts['sortbufsize']:
            sortbufsize = "-S %s" % addedopts['sortbufsize'][0]

        python = addedopts['python'][0]
        encodepipe = pyenv + ' ' + python + \
                     ' -m dumbo.cmd encodepipe -file ' + ' -file '.join(inputs)
        if addedopts['inputformat'] and addedopts['inputformat'][0] == 'code':
            encodepipe += ' -alreadycoded yes'
        if addedopts['addpath'] and addedopts['addpath'][0] != 'no':
            encodepipe += ' -addpath yes'
        if addedopts['numreducetasks'] and addedopts['numreducetasks'][0] == '0':
            retval = execute("%s | %s %s %s %s > '%s'" %
                             (encodepipe, pyenv, cmdenv, mapper, mpv, output))
        else:
            retval = execute(
                "%s | %s %s %s %s| LC_ALL=C sort %s %s %s| %s %s %s %s> '%s'" %
                (encodepipe, pyenv, cmdenv, mapper, mpv, sorttmpdir,
                 sortbufsize, spv, pyenv, cmdenv, reducer, rpv, output))
        if addedopts['delinputs'] and addedopts['delinputs'][0] == 'yes':
            for file in addedopts['input']:
                execute('rm ' + file)
        return retval
Example #4
    def test_getopts(self):
        # Test for backward compatibility
        opts = []
        values = getopts(opts, ['input'])
        self.assertEquals(values, {})
        self.assertEquals(opts, [])

        opts = [('param', 'p1'), ('param', 'p2'), ('input', '/dev/path'),
                ('output', '/prod/path')]
        values = getopts(opts, ['param', 'input'])
        expected = {'input': ['/dev/path'], 'param': ['p2', 'p1']}
        settize = lambda _dict: set([(k, tuple(sorted(v))) for k, v in _dict.items()])
        self.assertEquals(settize(values), settize(expected))
        self.assertEquals(set(opts), set([('output', '/prod/path')]))

        opts = [('output', '/prod/path')]
        values = getopts(opts, ['output'], delete=False)
        self.assertEquals(values, {'output': ['/prod/path']})
        self.assertEquals(opts, [('output', '/prod/path')])

        values = getopts(opts, ['output'])
        self.assertEquals(values, {'output': ['/prod/path']})
        self.assertEquals(opts, [])
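The behaviour exercised by this test — collect values for the requested keys into lists and, unless delete=False, strip the consumed pairs from opts in place — can be reproduced with a minimal sketch. This is only an illustration of the contract, not dumbo's actual getopts implementation:

    def getopts(opts, keys, delete=True):
        # Gather (key, value) pairs whose key is requested; optionally remove
        # them from the opts list in place, mirroring the assertions above.
        values = {}
        remaining = []
        for key, value in opts:
            if key in keys:
                values.setdefault(key, []).append(value)
            else:
                remaining.append((key, value))
        if delete:
            opts[:] = remaining
        return values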
Example #5
    def run(self):
        addedopts = getopts(self.opts, ['fake',
                                        'debug',
                                        'python',
                                        'iteration',
                                        'itercount',
                                        'hadoop',
                                        'starter',
                                        'name',
                                        'memlimit',
                                        'param',
                                        'parser',
                                        'record',
                                        'joinkeys',
                                        'hadoopconf',
                                        'mapper',
                                        'reducer',
                                        'partitioner'])
        if addedopts['fake'] and addedopts['fake'][0] == 'yes':
            def dummysystem(*args, **kwargs):
                return 0
            global system
            system = dummysystem  # not very clean, but it works...
        if addedopts['debug'] and addedopts['debug'][0] == 'yes':
            self.opts.append(('cmdenv', 'dumbo_debug=yes'))
        if not addedopts['python']:
            python = 'python'
        else:
            python = addedopts['python'][0]
        self.opts.append(('python', python))
        if not addedopts['iteration']:
            iter = 0
        else:
            iter = int(addedopts['iteration'][0])
        if not addedopts['itercount']:
            itercnt = 1
        else:
            itercnt = int(addedopts['itercount'][0])
        if addedopts['name']:
            name = addedopts['name'][0]
        else:
            name = self.prog.split('/')[-1]
        self.opts.append(('name', '%s (%s/%s)' % (name, iter + 1,
                         itercnt)))
        if not addedopts['hadoop']:
            pypath = '/'.join(self.prog.split('/')[:-1])
            if pypath: self.opts.append(('pypath', pypath))
        else:
            self.opts.append(('hadoop', addedopts['hadoop'][0]))
        progmod = self.prog.split('/')[-1]
        
        if progmod.endswith('.py'):
            progmod = progmod[:-3] 
           
        memlim = ' 262144000'  # 250MB limit by default
        if addedopts['memlimit']:
            # Limit amount of memory. This supports syntax 
            # of the form '256m', '12g' etc.
            try:
                _memlim = int(addedopts['memlimit'][0][:-1])
                memlim = ' %i' % {
                    'g': 1073741824    * _memlim,
                    'm': 1048576       * _memlim,
                    'k': 1024          * _memlim,
                    'b': 1             * _memlim,
                }[addedopts['memlimit'][0][-1].lower()]
            except KeyError:
                # Assume specified in bytes by default
                memlim = ' ' + addedopts['memlimit'][0]

        if addedopts['mapper']:
            self.opts.append(('mapper', addedopts['mapper'][0]))
        else:
            self.opts.append(('mapper', '%s -m %s map %i%s' % (python,
                             progmod, iter, memlim)))
        if addedopts['reducer']:
            self.opts.append(('reducer', addedopts['reducer'][0]))
        else:
            self.opts.append(('reducer', '%s -m %s red %i%s' % (python,
                             progmod, iter, memlim)))
        for param in addedopts['param']:
            self.opts.append(('cmdenv', param))
        if addedopts['parser'] and iter == 0:
            parser = addedopts['parser'][0]
            shortcuts = dict(configopts('parsers', self.prog))
            if parser in shortcuts:
                parser = shortcuts[parser]
            self.opts.append(('cmdenv', 'dumbo_parser=' + parser))
        if addedopts['record'] and iter == 0:
            record = addedopts['record'][0]
            shortcuts = dict(configopts('records', self.prog))
            if record in shortcuts:
                record = shortcuts[record]
            self.opts.append(('cmdenv', 'dumbo_record=' + record))
        if addedopts['joinkeys'] and addedopts['joinkeys'][0] == 'yes':
            self.opts.append(('cmdenv', 'dumbo_joinkeys=yes'))
            self.opts.append(('partitioner',
                              'org.apache.hadoop.mapred.lib.BinaryPartitioner'))
            self.opts.append(('jobconf',
                              'mapred.binary.partitioner.right.offset=-6'))
            # TODO throw an error if they also specified a partitioner
        elif addedopts['partitioner']:
            # only add a partioner if they didn't specify join-keys
            self.opts.append(('partitioner', addedopts['partitioner'][0]))
        for hadoopconf in addedopts['hadoopconf']:
            self.opts.append(('jobconf', hadoopconf))
        
        self.opts.append(('libegg', re.sub('\.egg.*$', '.egg', __file__)))
        return 0
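The memlimit handling in this run() accepts either a plain byte count or a size with a one-letter suffix ('256m', '12g', ...). The same parsing, pulled out into a standalone helper as a sketch (the original keeps the value embedded in the mapper/reducer command string):

    def parse_memlimit(value):
        # '256m', '12g', '1024k', '100b', or a plain number of bytes.
        units = {'g': 1073741824, 'm': 1048576, 'k': 1024, 'b': 1}
        suffix = value[-1].lower()
        if suffix in units:
            return units[suffix] * int(value[:-1])
        return int(value)  # no recognized suffix: assume bytes

    # parse_memlimit('256m') -> 268435456
    # parse_memlimit('2g')   -> 2147483648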
Example #6
 def run(self):
     retval = Iteration.run(self)
     if retval != 0:
         return retval
     if os.path.exists(self.prog):
         self.opts.append(("file", self.prog))
     addedopts = getopts(
         self.opts,
         [
             "hadoop",
             "name",
             "delinputs",
             "libegg",
             "libjar",
             "inputformat",
             "outputformat",
             "nummaptasks",
             "numreducetasks",
             "priority",
             "queue",
             "cachefile",
             "cachearchive",
             "file",
             "codewritable",
             "addpath",
             "getpath",
             "python",
             "streamoutput",
             "pypath",
         ],
     )
     hadoop = findhadoop(addedopts["hadoop"][0])
     streamingjar = findjar(hadoop, "streaming")
     if not streamingjar:
         print >> sys.stderr, "ERROR: Streaming jar not found"
         return 1
     try:
         import typedbytes
     except ImportError:
         print >> sys.stderr, 'ERROR: "typedbytes" module not found'
         return 1
     modpath = re.sub("\.egg.*$", ".egg", typedbytes.__file__)
     if modpath.endswith(".egg"):
         addedopts["libegg"].append(modpath)
     else:
         self.opts.append(("file", modpath))
     self.opts.append(("jobconf", "stream.map.input=typedbytes"))
     self.opts.append(("jobconf", "stream.reduce.input=typedbytes"))
     if addedopts["numreducetasks"] and addedopts["numreducetasks"][0] == "0":
         self.opts.append(("jobconf", "stream.reduce.output=typedbytes"))
         if addedopts["streamoutput"]:
             id_ = addedopts["streamoutput"][0]
             self.opts.append(("jobconf", "stream.map.output=" + id_))
         else:
             self.opts.append(("jobconf", "stream.map.output=typedbytes"))
     else:
         self.opts.append(("jobconf", "stream.map.output=typedbytes"))
         if addedopts["streamoutput"]:
             id_ = addedopts["streamoutput"][0]
             self.opts.append(("jobconf", "stream.reduce.output=" + id_))
         else:
             self.opts.append(("jobconf", "stream.reduce.output=typedbytes"))
     if not addedopts["name"]:
         self.opts.append(("jobconf", "mapred.job.name=" + self.prog.split("/")[-1]))
     else:
         self.opts.append(("jobconf", "mapred.job.name=%s" % addedopts["name"][0]))
     if addedopts["nummaptasks"]:
         self.opts.append(("jobconf", "mapred.map.tasks=%s" % addedopts["nummaptasks"][0]))
     if addedopts["numreducetasks"]:
         numreducetasks = int(addedopts["numreducetasks"][0])
         self.opts.append(("numReduceTasks", str(numreducetasks)))
     if addedopts["priority"]:
         self.opts.append(("jobconf", "mapred.job.priority=%s" % addedopts["priority"][0]))
     if addedopts["queue"]:
         self.opts.append(("jobconf", "mapred.job.queue.name=%s" % addedopts["queue"][0]))
     if addedopts["cachefile"]:
         for cachefile in addedopts["cachefile"]:
             self.opts.append(("cacheFile", cachefile))
     if addedopts["cachearchive"]:
         for cachearchive in addedopts["cachearchive"]:
             self.opts.append(("cacheArchive", cachearchive))
     if addedopts["file"]:
         for file in addedopts["file"]:
             if not "://" in file:
                 if not os.path.exists(file):
                     raise ValueError('file "' + file + '" does not exist')
                 file = "file://" + os.path.abspath(file)
             self.opts.append(("file", file))
     if not addedopts["inputformat"]:
         addedopts["inputformat"] = ["auto"]
     inputformat_shortcuts = {
         "code": "org.apache.hadoop.streaming.AutoInputFormat",
         "text": "org.apache.hadoop.mapred.TextInputFormat",
         "sequencefile": "org.apache.hadoop.streaming.AutoInputFormat",
         "auto": "org.apache.hadoop.streaming.AutoInputFormat",
     }
     inputformat_shortcuts.update(configopts("inputformats", self.prog))
     inputformat = addedopts["inputformat"][0]
     if inputformat_shortcuts.has_key(inputformat.lower()):
         inputformat = inputformat_shortcuts[inputformat.lower()]
     self.opts.append(("inputformat", inputformat))
     if not addedopts["outputformat"]:
         addedopts["outputformat"] = ["sequencefile"]
     if addedopts["getpath"] and addedopts["getpath"] != "no":
         outputformat_shortcuts = {
             "code": "fm.last.feathers.output.MultipleSequenceFiles",
             "text": "fm.last.feathers.output.MultipleTextFiles",
             "raw": "fm.last.feathers.output.MultipleRawFileOutputFormat",
             "sequencefile": "fm.last.feathers.output.MultipleSequenceFiles",
         }
     else:
         outputformat_shortcuts = {
             "code": "org.apache.hadoop.mapred.SequenceFileOutputFormat",
             "text": "org.apache.hadoop.mapred.TextOutputFormat",
             "raw": "fm.last.feathers.output.RawFileOutputFormat",
             "sequencefile": "org.apache.hadoop.mapred.SequenceFileOutputFormat",
         }
     outputformat_shortcuts.update(configopts("outputformats", self.prog))
     outputformat = addedopts["outputformat"][0]
     if outputformat_shortcuts.has_key(outputformat.lower()):
         outputformat = outputformat_shortcuts[outputformat.lower()]
     self.opts.append(("outputformat", outputformat))
     if addedopts["addpath"] and addedopts["addpath"][0] != "no":
         self.opts.append(("cmdenv", "dumbo_addpath=true"))
     pyenv = envdef(
         "PYTHONPATH",
         addedopts["libegg"],
         "file",
         self.opts,
         shortcuts=dict(configopts("eggs", self.prog)),
         quote=False,
         trim=True,
         extrapaths=addedopts["pypath"],
     )
     if pyenv:
         self.opts.append(("cmdenv", pyenv))
     hadenv = envdef(
         "HADOOP_CLASSPATH", addedopts["libjar"], "libjar", self.opts, shortcuts=dict(configopts("jars", self.prog))
     )
     fileopt = getopt(self.opts, "file")
     if fileopt:
         tmpfiles = []
         for file in fileopt:
             if file.startswith("file://"):
                 self.opts.append(("file", file[7:]))
             else:
                 tmpfiles.append(file)
         if tmpfiles:
             self.opts.append(("jobconf", "tmpfiles=" + ",".join(tmpfiles)))
     libjaropt = getopt(self.opts, "libjar")
     if libjaropt:
         tmpjars = []
         for jar in libjaropt:
             if jar.startswith("file://"):
                 self.opts.append(("file", jar[7:]))
             else:
                 tmpjars.append(jar)
         if tmpjars:
             self.opts.append(("jobconf", "tmpjars=" + ",".join(tmpjars)))
     cmd = hadoop + "/bin/hadoop jar " + streamingjar
     retval = execute(cmd, self.opts, hadenv)
     if addedopts["delinputs"] and addedopts["delinputs"][0] == "yes":
         for (key, value) in self.opts:
             if key == "input":
                 if os.path.exists(hadoop + "/bin/hdfs"):
                     hdfs = hadoop + "/bin/hdfs"
                 else:
                     hdfs = hadoop + "/bin/hadoop"
                 execute("%s dfs -rmr '%s'" % (hdfs, value))
     return retval
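One detail of this streaming backend worth isolating is how -file arguments are normalized before submission: paths without a scheme must exist locally and get a file:// prefix, while URIs are passed through untouched. A standalone sketch of that rule:

    import os

    def normalize_file_opt(path):
        # Local paths are validated and turned into file:// URIs; anything
        # that already carries a scheme (e.g. hdfs://) is left alone.
        if '://' not in path:
            if not os.path.exists(path):
                raise ValueError('file "' + path + '" does not exist')
            return 'file://' + os.path.abspath(path)
        return path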
Example #7
    def run(self):
        addedopts = getopts(
            self.opts,
            [
                "fake",
                "debug",
                "python",
                "iteration",
                "itercount",
                "hadoop",
                "starter",
                "name",
                "memlimit",
                "param",
                "parser",
                "record",
                "joinkeys",
                "hadoopconf",
                "mapper",
                "reducer",
            ],
        )
        if addedopts["fake"] and addedopts["fake"][0] == "yes":

            def dummysystem(*args, **kwargs):
                return 0

            global system
            system = dummysystem  # not very clean, but it works...
        if addedopts["debug"] and addedopts["debug"][0] == "yes":
            self.opts.append(("cmdenv", "dumbo_debug=yes"))
        if not addedopts["python"]:
            python = "python"
        else:
            python = addedopts["python"][0]
        self.opts.append(("python", python))
        if not addedopts["iteration"]:
            iter = 0
        else:
            iter = int(addedopts["iteration"][0])
        if not addedopts["itercount"]:
            itercnt = 1
        else:
            itercnt = int(addedopts["itercount"][0])
        if addedopts["name"]:
            name = addedopts["name"][0]
        else:
            name = self.prog.split("/")[-1]
        self.opts.append(("name", "%s (%s/%s)" % (name, iter + 1, itercnt)))
        if not addedopts["hadoop"]:
            pypath = "/".join(self.prog.split("/")[:-1])
            if pypath:
                self.opts.append(("pypath", pypath))
        else:
            self.opts.append(("hadoop", addedopts["hadoop"][0]))
        progmod = self.prog.split("/")[-1]
        progmod = progmod[:-3] if progmod.endswith(".py") else progmod
        memlim = " 262144000"  # 250MB limit by default
        if addedopts["memlimit"]:
            # Limit amount of memory. This supports syntax
            # of the form '256m', '12g' etc.
            try:
                _memlim = int(addedopts["memlimit"][0][:-1])
                memlim = (
                    " %i"
                    % {"g": 1073741824 * _memlim, "m": 1048576 * _memlim, "k": 1024 * _memlim, "b": 1 * _memlim}[
                        addedopts["memlimit"][0][-1].lower()
                    ]
                )
            except KeyError:
                # Assume specified in bytes by default
                memlim = " " + addedopts["memlimit"][0]

        if addedopts["mapper"]:
            self.opts.append(("mapper", addedopts["mapper"][0]))
        else:
            self.opts.append(("mapper", "%s -m %s map %i%s" % (python, progmod, iter, memlim)))
        if addedopts["reducer"]:
            self.opts.append(("reducer", addedopts["reducer"][0]))
        else:
            self.opts.append(("reducer", "%s -m %s red %i%s" % (python, progmod, iter, memlim)))
        for param in addedopts["param"]:
            self.opts.append(("cmdenv", param))
        if addedopts["parser"] and iter == 0:
            parser = addedopts["parser"][0]
            shortcuts = dict(configopts("parsers", self.prog))
            if parser in shortcuts:
                parser = shortcuts[parser]
            self.opts.append(("cmdenv", "dumbo_parser=" + parser))
        if addedopts["record"] and iter == 0:
            record = addedopts["record"][0]
            shortcuts = dict(configopts("records", self.prog))
            if record in shortcuts:
                record = shortcuts[record]
            self.opts.append(("cmdenv", "dumbo_record=" + record))
        if addedopts["joinkeys"] and addedopts["joinkeys"][0] == "yes":
            self.opts.append(("cmdenv", "dumbo_joinkeys=yes"))
            self.opts.append(("partitioner", "org.apache.hadoop.mapred.lib.BinaryPartitioner"))
            self.opts.append(("jobconf", "mapred.binary.partitioner.right.offset=-6"))
        for hadoopconf in addedopts["hadoopconf"]:
            self.opts.append(("jobconf", hadoopconf))
        self.opts.append(("libegg", re.sub("\.egg.*$", ".egg", __file__)))
        return 0
Example #8
 def run(self):
     retval = Iteration.run(self)
     if retval != 0:
         return retval
     if os.path.exists(self.prog):
         self.opts.append(('file', self.prog))
     addedopts = getopts(self.opts, ['hadoop',
                                     'name',
                                     'delinputs',
                                     'libegg',
                                     'libjar',
                                     'libjarstreaming',
                                     'inputformat',
                                     'outputformat',
                                     'nummaptasks',
                                     'numreducetasks',
                                     'priority',
                                     'queue',
                                     'cachefile',
                                     'cachearchive',
                                     'file',
                                     'codewritable',
                                     'addpath',
                                     'getpath',
                                     'python',
                                     'streamoutput',
                                     'pypath'])
     hadoop = findhadoop(addedopts['hadoop'][0])
     streamingjar = getopt(self.opts,'streamingjar')
     if streamingjar is None or len(streamingjar)==0:
         streamingjar = findjar(hadoop,'streaming')
     else:
         streamingjar = streamingjar[0]
     if not streamingjar:
         print >> sys.stderr, 'ERROR: Streaming jar not found'
         return 1
         
     # add typedbytes to path
     try: 
         import typedbytes
     except ImportError:
         print >> sys.stderr, 'ERROR: "typedbytes" module not found'
         return 1
     modpath = re.sub('\.egg.*$', '.egg', typedbytes.__file__)
     if modpath.endswith('.egg'):            
         addedopts['libegg'].append(modpath)    
     else:
         self.opts.append(('file', modpath)) 
         
     # add ctypedbytes to job
     try: 
         import ctypedbytes
         print >>sys.stderr, 'INFO: "ctypedbytes" found!'
         modpath = re.sub('\.egg.*$', '.egg', ctypedbytes.__file__)
         if modpath.endswith('.egg'):            
             addedopts['libegg'].append(modpath)
     except ImportError:
         pass        
         
         
     self.opts.append(('jobconf', 'stream.map.input=typedbytes'))
     self.opts.append(('jobconf', 'stream.reduce.input=typedbytes'))
     if addedopts['numreducetasks'] and addedopts['numreducetasks'][0] == '0':
         self.opts.append(('jobconf', 'stream.reduce.output=typedbytes'))
         if addedopts['streamoutput']:
             id_ = addedopts['streamoutput'][0]
             self.opts.append(('jobconf', 'stream.map.output=' + id_))
         else: 
             self.opts.append(('jobconf', 'stream.map.output=typedbytes'))
     else:
         self.opts.append(('jobconf', 'stream.map.output=typedbytes'))
         if addedopts['streamoutput']:
             id_ = addedopts['streamoutput'][0]
             self.opts.append(('jobconf', 'stream.reduce.output=' + id_))
         else:
             self.opts.append(('jobconf', 'stream.reduce.output=typedbytes'))
     if not addedopts['name']:
         self.opts.append(('jobconf', 'mapred.job.name='
                           + self.prog.split('/')[-1]))
     else:
         self.opts.append(('jobconf', 'mapred.job.name=%s'
                           % addedopts['name'][0]))
     if addedopts['nummaptasks']:
         self.opts.append(('jobconf', 'mapred.map.tasks=%s'
                           % addedopts['nummaptasks'][0]))
     if addedopts['numreducetasks']:
         numreducetasks = int(addedopts['numreducetasks'][0])
         self.opts.append(('numReduceTasks', str(numreducetasks)))
     if addedopts['priority']:
         self.opts.append(('jobconf', 'mapred.job.priority=%s'
                           % addedopts['priority'][0]))
     if addedopts['queue']:
         self.opts.append(('jobconf', 'mapred.job.queue.name=%s'
                           % addedopts['queue'][0]))
     if addedopts['cachefile']:
         for cachefile in addedopts['cachefile']:
             self.opts.append(('cacheFile', cachefile))
     if addedopts['cachearchive']:
         for cachearchive in addedopts['cachearchive']:
             self.opts.append(('cacheArchive', cachearchive))
     if addedopts['file']:
         for file in addedopts['file']:
             if not '://' in file:
                 if not os.path.exists(file):
                     raise ValueError('file "' + file + '" does not exist')
                 file = 'file://' + os.path.abspath(file)
             self.opts.append(('file', file))
     if not addedopts['inputformat']:
         addedopts['inputformat'] = ['auto']
     inputformat_shortcuts = \
         {'code': 'org.apache.hadoop.streaming.AutoInputFormat',
          'text': 'org.apache.hadoop.mapred.TextInputFormat',
          'sequencefile': 'org.apache.hadoop.streaming.AutoInputFormat',
          'auto': 'org.apache.hadoop.streaming.AutoInputFormat'}
     inputformat_shortcuts.update(configopts('inputformats', self.prog))
     inputformat = addedopts['inputformat'][0]
     if inputformat_shortcuts.has_key(inputformat.lower()):
         inputformat = inputformat_shortcuts[inputformat.lower()]
     self.opts.append(('inputformat', inputformat))
     if not addedopts['outputformat']:
         addedopts['outputformat'] = ['sequencefile']
      if addedopts['getpath'] and addedopts['getpath'][0] != 'no':
         outputformat_shortcuts = \
             {'code': 'fm.last.feathers.output.MultipleSequenceFiles',
              'text': 'fm.last.feathers.output.MultipleTextFiles',               
              'raw': 'fm.last.feathers.output.MultipleRawFileOutputFormat',
              'sequencefile': 'fm.last.feathers.output.MultipleSequenceFiles'}
     else:
         outputformat_shortcuts = \
             {'code': 'org.apache.hadoop.mapred.SequenceFileOutputFormat',
              'text': 'org.apache.hadoop.mapred.TextOutputFormat',
              'raw': 'fm.last.feathers.output.RawFileOutputFormat',
              'sequencefile': 'org.apache.hadoop.mapred.SequenceFileOutputFormat'}
     outputformat_shortcuts.update(configopts('outputformats', self.prog))
     outputformat = addedopts['outputformat'][0]
     if outputformat_shortcuts.has_key(outputformat.lower()):
         outputformat = outputformat_shortcuts[outputformat.lower()]
     self.opts.append(('outputformat', outputformat))
     if addedopts['addpath'] and addedopts['addpath'][0] != 'no':
         self.opts.append(('cmdenv', 'dumbo_addpath=true'))
     pyenv = envdef('PYTHONPATH',
                    addedopts['libegg'],
                    'file',
                    self.opts,
                    shortcuts=dict(configopts('eggs', self.prog)),
                    quote=False,
                    trim=True,
                    extrapaths=addedopts['pypath'])
     if pyenv:
         self.opts.append(('cmdenv', pyenv))
     if addedopts['libjarstreaming'] and addedopts['libjarstreaming'][0] != 'no':
         addedopts['libjar'].append(streamingjar)
     hadenv = envdef('HADOOP_CLASSPATH', addedopts['libjar'], 'libjar', 
                     self.opts, shortcuts=dict(configopts('jars', self.prog)))
     fileopt = getopt(self.opts, 'file')
     if fileopt:
         tmpfiles = []
         for file in fileopt:
             if file.startswith('file://'):
                 self.opts.append(('file', file[7:]))
             else:
                 tmpfiles.append(file)
         if tmpfiles:
             self.opts.append(('jobconf', 'tmpfiles=' + ','.join(tmpfiles)))
     libjaropt = getopt(self.opts, 'libjar')
     if libjaropt:
         tmpjars = []
         for jar in libjaropt:
             if jar.startswith('file://'):
                 self.opts.append(('file', jar[7:]))
             else:
                 tmpjars.append(jar)
         if tmpjars:
             self.opts.append(('jobconf', 'tmpjars=' + ','.join(tmpjars)))
     cmd = hadoop + '/bin/hadoop jar ' + streamingjar
     retval = execute(cmd, self.opts, hadenv)
     if addedopts['delinputs'] and addedopts['delinputs'][0] == 'yes':
         for (key, value) in self.opts:
             if key == 'input':
                 if os.path.exists(hadoop + "/bin/hdfs"):
                     hdfs = hadoop + "/bin/hdfs"
                 else:
                     hdfs = hadoop + "/bin/hadoop"
                 execute("%s dfs -rmr '%s'" % (hdfs, value))
     return retval
Example #9
    def run(self):
        retval = Iteration.run(self)
        if retval != 0:
            return retval
        addedopts = getopts(self.opts, ['input',
                                        'output',
                                        'mapper',
                                        'reducer',
                                        'libegg',
                                        'delinputs',
                                        'cmdenv',
                                        'pv',
                                        'addpath',
                                        'inputformat',
                                        'outputformat',
                                        'numreducetasks',
                                        'python',
                                        'pypath',
                                        'sorttmpdir',
                                        'sortbufsize'])
        (mapper, reducer) = (addedopts['mapper'][0], addedopts['reducer'][0])
        if not addedopts['input'] or not addedopts['output']:
            print >> sys.stderr, 'ERROR: input or output not specified'
            return 1
        inputs = reduce(operator.concat, (input.split(' ') for input in
                        addedopts['input']))
        output = addedopts['output'][0]
        pyenv = envdef('PYTHONPATH', addedopts['libegg'],
                       shortcuts=dict(configopts('eggs', self.prog)),
                       extrapaths=addedopts['pypath'])
        cmdenv = ' '.join("%s='%s'" % tuple(arg.split('=')) for arg in
                          addedopts['cmdenv'])
        if addedopts['pv'] and addedopts['pv'][0] == 'yes':
            mpv = '| pv -s `du -b %s | cut -f 1` -cN map ' % ' '.join(inputs)
            (spv, rpv) = ('| pv -cN sort ', '| pv -cN reduce ')
        else:
            (mpv, spv, rpv) = ('', '', '')

        (sorttmpdir, sortbufsize) = ('', '')
        if addedopts['sorttmpdir']:
            sorttmpdir = "-T %s" % addedopts['sorttmpdir'][0]
        if addedopts['sortbufsize']:
            sortbufsize = "-S %s" % addedopts['sortbufsize'][0]

        python = addedopts['python'][0]
        encodepipe = pyenv + ' ' + python + \
                     ' -m dumbo.cmd encodepipe -file ' + ' -file '.join(inputs)
        if addedopts['inputformat'] and addedopts['inputformat'][0] == 'code':
            encodepipe += ' -alreadycoded yes'
        if addedopts['addpath'] and addedopts['addpath'][0] != 'no':
            encodepipe += ' -addpath yes'
        if addedopts['numreducetasks'] and addedopts['numreducetasks'][0] == '0':
            retval = execute("%s | %s %s %s %s > '%s'" % (encodepipe,
                                                          pyenv,
                                                          cmdenv,
                                                          mapper,
                                                          mpv,
                                                          output))
        else:
            retval = execute("%s | %s %s %s %s| LC_ALL=C sort %s %s %s| %s %s %s %s> '%s'"
                             % (encodepipe,
                                pyenv,
                                cmdenv,
                                mapper,
                                mpv,
                                sorttmpdir,
                                sortbufsize,
                                spv,
                                pyenv,
                                cmdenv,
                                reducer,
                                rpv,
                                output))
        if addedopts['delinputs'] and addedopts['delinputs'][0] == 'yes':
            for file in addedopts['input']:
                execute('rm ' + file)
        return retval
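For reference, the reduce branch of this local backend assembles one ordinary shell pipeline from those pieces. A toy illustration with hypothetical placeholder values (the real values come from the options parsed above); the map-only branch simply drops the sort and reduce stages:

    encodepipe = "python -m dumbo.cmd encodepipe -file input.txt"  # hypothetical
    pyenv = cmdenv = ""                                            # hypothetical
    mapper, reducer, output = "python prog.py map 0", "python prog.py red 0", "out"
    cmd = "%s | %s %s %s %s| LC_ALL=C sort %s %s %s| %s %s %s %s> '%s'" % (
        encodepipe, pyenv, cmdenv, mapper, "", "", "", "",
        pyenv, cmdenv, reducer, "", output)
    # cmd is roughly: encodepipe | mapper | LC_ALL=C sort | reducer > 'out'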
Example #10
    def run(self):
        addedopts = getopts(self.opts, [
            'fake', 'debug', 'python', 'iteration', 'itercount', 'hadoop',
            'starter', 'name', 'memlimit', 'param', 'parser', 'record',
            'joinkeys', 'hadoopconf', 'mapper', 'reducer'
        ])
        if addedopts['fake'] and addedopts['fake'][0] == 'yes':

            def dummysystem(*args, **kwargs):
                return 0

            global system
            system = dummysystem  # not very clean, but it works...
        if addedopts['debug'] and addedopts['debug'][0] == 'yes':
            self.opts.append(('cmdenv', 'dumbo_debug=yes'))
        if not addedopts['python']:
            python = 'python'
        else:
            python = addedopts['python'][0]
        self.opts.append(('python', python))
        if not addedopts['iteration']:
            iter = 0
        else:
            iter = int(addedopts['iteration'][0])
        if not addedopts['itercount']:
            itercnt = 1
        else:
            itercnt = int(addedopts['itercount'][0])
        if addedopts['name']:
            name = addedopts['name'][0]
        else:
            name = self.prog.split('/')[-1]
        self.opts.append(('name', '%s (%s/%s)' % (name, iter + 1, itercnt)))
        if not addedopts['hadoop']:
            pypath = '/'.join(self.prog.split('/')[:-1])
            if pypath: self.opts.append(('pypath', pypath))
        else:
            self.opts.append(('hadoop', addedopts['hadoop'][0]))
        progmod = self.prog.split('/')[-1]
        progmod = progmod[:-3] if progmod.endswith('.py') else progmod
        memlim = ' 262144000'  # 250MB limit by default
        if addedopts['memlimit']:
            # Limit amount of memory. This supports syntax
            # of the form '256m', '12g' etc.
            try:
                _memlim = int(addedopts['memlimit'][0][:-1])
                memlim = ' %i' % {
                    'g': 1073741824 * _memlim,
                    'm': 1048576 * _memlim,
                    'k': 1024 * _memlim,
                    'b': 1 * _memlim,
                }[addedopts['memlimit'][0][-1].lower()]
            except KeyError:
                # Assume specified in bytes by default
                memlim = ' ' + addedopts['memlimit'][0]

        if addedopts['mapper']:
            self.opts.append(('mapper', addedopts['mapper'][0]))
        else:
            self.opts.append(
                ('mapper',
                 '%s -m %s map %i%s' % (python, progmod, iter, memlim)))
        if addedopts['reducer']:
            self.opts.append(('reducer', addedopts['reducer'][0]))
        else:
            self.opts.append(
                ('reducer',
                 '%s -m %s red %i%s' % (python, progmod, iter, memlim)))
        for param in addedopts['param']:
            self.opts.append(('cmdenv', param))
        if addedopts['parser'] and iter == 0:
            parser = addedopts['parser'][0]
            shortcuts = dict(configopts('parsers', self.prog))
            if parser in shortcuts:
                parser = shortcuts[parser]
            self.opts.append(('cmdenv', 'dumbo_parser=' + parser))
        if addedopts['record'] and iter == 0:
            record = addedopts['record'][0]
            shortcuts = dict(configopts('records', self.prog))
            if record in shortcuts:
                record = shortcuts[record]
            self.opts.append(('cmdenv', 'dumbo_record=' + record))
        if addedopts['joinkeys'] and addedopts['joinkeys'][0] == 'yes':
            self.opts.append(('cmdenv', 'dumbo_joinkeys=yes'))
            self.opts.append(
                ('partitioner',
                 'org.apache.hadoop.mapred.lib.BinaryPartitioner'))
            self.opts.append(
                ('jobconf', 'mapred.binary.partitioner.right.offset=-6'))
        for hadoopconf in addedopts['hadoopconf']:
            self.opts.append(('jobconf', hadoopconf))
        self.opts.append(('libegg', re.sub('\.egg.*$', '.egg', __file__)))
        return 0
Example #11
File: punix.py  Project: jso/dumbo
    def run(self):
        retval = Iteration.run(self)
        if retval != 0:
            return retval
        addedopts = getopts(self.opts, ['input',
                                        'output',
                                        'mapper',
                                        'reducer',
                                        'libegg',
                                        'delinputs',
                                        'cmdenv',
                                        'inputformat',
                                        'outputformat',
                                        'numreducetasks',
                                        'python',
                                        'pypath',
                                        'tmpdir',
                                        'nmappers',
                                        'nreducers',
                                        'permapper',
                                        'shell'])
        (mapper, reducer) = (addedopts['mapper'][0], addedopts['reducer'][0])
        if not addedopts['input'] or not addedopts['output']:
            print >> sys.stderr, 'ERROR: input or output not specified'
            return 1
        inputs = reduce(operator.concat, (input.split(' ') for input in
                        addedopts['input']))
        output = addedopts['output'][0]
        try: os.makedirs(output)
        except os.error as e: pass

        pyenv = envdef('PYTHONPATH', addedopts['libegg'],
                       shortcuts=dict(configopts('eggs', self.prog)),
                       extrapaths=addedopts['pypath'])
        cmdenv = ' '.join("%s='%s'" % tuple(arg.split('=')) for arg in
                          addedopts['cmdenv'])

        shell = addedopts["shell"][0]

        python = addedopts['python'][0]

        mapTotal = len(inputs)
        mapDoneCount = [0]
        reduceDoneCount = [0]

        nMappers = int(addedopts["nmappers"][0])
        nReducers = int(addedopts["nreducers"][0])

        # this is the number of files that will be handed to each mapper
        permapper = int(addedopts["permapper"][0])

        # start the mappers, reducers
        mPool = Pool(nMappers)
        rPool = Pool(nReducers)

        doReduces = not (addedopts['numreducetasks'] and addedopts['numreducetasks'][0] == '0')

        # set up the mapper output/reducer input directories
        tmpdir = os.sep.join([addedopts['tmpdir'][0], "%s_%06d" % (time.strftime("%Y-%m-%d_%H-%M-%S", time.gmtime()), random.randint(0, 999999))])

        mLock = threading.Lock()
        mResults = {}
        rLock = threading.Lock()
        mByR = {}
        rStarted = set()
        rResults = {}

        # start the map status output copier
        mapErrOutputCopier = MapErrOutputCopier(tmpdir)
        mapErrOutputCopier.start()

        if doReduces:
            # start the copy threads to handle map outputs
            copyLock = threading.Lock()
            copyThreads = {}
            for i in range(nReducers):
                copyThreads[i] = ReduceInputCopier(tmpdir, i)
                copyThreads[i].start()

            # start the reduce status output copier
            reduceErrOutputCopier = ReduceErrOutputCopier(tmpdir)
            reduceErrOutputCopier.start()

            for i in range(nReducers):
                try: os.makedirs(os.sep.join([tmpdir, "r-%d" % i]))
                except os.error as e: pass

                mByR[i] = set()

        # do maps -- kick it all off
        if permapper == 1:
            for args in enumerate(inputs):
                i, filename = args
                args = pyenv, python, cmdenv, mapper, nReducers, tmpdir, output, addedopts, shell, i, [filename], doReduces
                mLock.acquire()
                mResults[i] = mPool.apply_async(doMap, args)
                mLock.release()
        else:
            # multiple files per mapper...
            remaining = list(inputs)
            i = 0
            while remaining:
                args = pyenv, python, cmdenv, mapper, nReducers, tmpdir, output, addedopts, shell, i, remaining[:permapper], doReduces
                mLock.acquire()
                mResults[i] = mPool.apply_async(doMap, args)
                mLock.release()

                remaining = remaining[permapper:]
                i += 1
            
            # need to reset the mapTotal variable since we have fewer tasks...
            mapTotal = i

        def reduceDone():
            # did anything finish?
            rLock.acquire()

            done = [x for x in rResults if rResults[x].ready()]
            for args in done: del rResults[args] # cleanup

            rLock.release()

            for reducenum in done:
                #print "reduce %d done" % reducenum
                reduceDoneCount[0] += 1

                reduceErrOutputCopier.reduce_done(reducenum)

        def mapDone():
            # did anything finish?
            mLock.acquire()

            done = [x for x in mResults if mResults[x].ready()]
            for args in done: del mResults[args] # cleanup

            mLock.release()

            for args in done:
                i = args

                mapDoneCount[0] += 1

                mapErrOutputCopier.map_done(i)

                if doReduces:
                    #print "map %d done" % i

                    # update the structures
                    for reducenum in range(nReducers):
                        # initiate the copy request...
                        copyThreads[reducenum].map_done(i)

                        rLock.acquire()

                        mByR[reducenum].add(i)

                        # see if we can signal that's all the copier will have to handle?
                        if len(mByR[reducenum]) == mapTotal:
                            copyThreads[reducenum].map_done(None)

                        rLock.release()
                else:
                    # just move the map output file (unsorted) to the output directory
                    print "map %d done" % i


        def copyDone():
            # did anything finish?
            copyLock.acquire()

            done = [x for x in copyThreads if not copyThreads[x].is_alive()]

            for rnum in done: del copyThreads[rnum] # cleanup

            copyLock.release()

            for rnum in done:
                rLock.acquire()

                rStarted.add(rnum)
                args = tmpdir, pyenv, cmdenv, reducer, output, shell, rnum
                rResults[rnum] = rPool.apply_async(doReduce, args)

                rLock.release()

        while reduceDoneCount[0] < nReducers:
            # check for things finishing...
            mapDone()
            copyDone()
            reduceDone()

            mLock.acquire()
            haveMaps = len(mResults)
            mLock.release()

            rLock.acquire()
            haveReduces = len(rResults)
            rLock.release()

            copyLock.acquire()
            copyRunning = len(copyThreads)
            copyLock.release()

            print "%d/%d/%d maps\t%d/%d copies\t%d/%d/%d reduces" % (haveMaps, mapDoneCount[0], mapTotal, copyRunning, nReducers, haveReduces, reduceDoneCount[0], nReducers)

            time.sleep(5)

        mPool.terminate()
        mPool.join()
        rPool.terminate()
        rPool.join()

        # make sure the map status output is done before cleaning up the tmp dir
        mapErrOutputCopier.map_done(None)
        mapErrOutputCopier.join()

        if doReduces:
            # make sure the reduce status output is done before cleaning up the tmp dir
            reduceErrOutputCopier.reduce_done(None)
            reduceErrOutputCopier.join()

        if not master_debug and len(os.listdir(tmpdir)) == 0:
            os.rmdir(tmpdir)


        return 0  # TODO: return an error code if any map/reduce task failed
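The scheduler in punix.py never blocks on a single task: it polls AsyncResult.ready() for maps, copies, and reduces in one loop. The bare polling pattern, stripped of the copier threads (a sketch, not the punix.py code itself):

    import time
    from multiprocessing import Pool

    def work(i):
        return i * i

    if __name__ == '__main__':
        pool = Pool(4)
        pending = dict((i, pool.apply_async(work, (i,))) for i in range(8))
        while pending:
            done = [i for i, r in pending.items() if r.ready()]
            for i in done:
                print('task %d -> %s' % (i, pending.pop(i).get()))
            time.sleep(0.5)
        pool.close()
        pool.join()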
Example #12
 def run(self):
     retval = Iteration.run(self)
     if retval != 0:
         return retval
     if os.path.exists(self.prog):
         self.opts.append(('file', self.prog))
     addedopts = getopts(self.opts, ['hadoop',
                                     'name',
                                     'delinputs',
                                     'libegg',
                                     'libjar',
                                     'inputformat',
                                     'outputformat',
                                     'nummaptasks',
                                     'numreducetasks',
                                     'priority',
                                     'queue',
                                     'cachefile',
                                     'cachearchive',
                                     'file',
                                     'codewritable',
                                     'addpath',
                                     'getpath',
                                     'python',
                                     'streamoutput',
                                     'pypath'])
     hadoop = findhadoop(addedopts['hadoop'][0])
     streamingjar = findjar(hadoop, 'streaming')
     if not streamingjar:
         print >> sys.stderr, 'ERROR: Streaming jar not found'
         return 1
     try: 
         import typedbytes
     except ImportError:
         print >> sys.stderr, 'ERROR: "typedbytes" module not found'
         return 1
     modpath = re.sub('\.egg.*$', '.egg', typedbytes.__file__)
     if modpath.endswith('.egg'):            
         addedopts['libegg'].append(modpath)    
     else:
         self.opts.append(('file', modpath)) 
     self.opts.append(('jobconf', 'stream.map.input=typedbytes'))
     self.opts.append(('jobconf', 'stream.reduce.input=typedbytes'))
     if addedopts['numreducetasks'] and addedopts['numreducetasks'][0] == '0':
         self.opts.append(('jobconf', 'stream.reduce.output=typedbytes'))
         if addedopts['streamoutput']:
             id_ = addedopts['streamoutput'][0]
             self.opts.append(('jobconf', 'stream.map.output=' + id_))
         else: 
             self.opts.append(('jobconf', 'stream.map.output=typedbytes'))
     else:
         self.opts.append(('jobconf', 'stream.map.output=typedbytes'))
         if addedopts['streamoutput']:
             id_ = addedopts['streamoutput'][0]
             self.opts.append(('jobconf', 'stream.reduce.output=' + id_))
         else:
             self.opts.append(('jobconf', 'stream.reduce.output=typedbytes'))
     if not addedopts['name']:
         self.opts.append(('jobconf', 'mapred.job.name='
                           + self.prog.split('/')[-1]))
     else:
         self.opts.append(('jobconf', 'mapred.job.name=%s'
                           % addedopts['name'][0]))
     if addedopts['nummaptasks']:
         self.opts.append(('jobconf', 'mapred.map.tasks=%s'
                           % addedopts['nummaptasks'][0]))
     if addedopts['numreducetasks']:
         numreducetasks = int(addedopts['numreducetasks'][0])
         self.opts.append(('numReduceTasks', str(numreducetasks)))
     if addedopts['priority']:
         self.opts.append(('jobconf', 'mapred.job.priority=%s'
                           % addedopts['priority'][0]))
     if addedopts['queue']:
         self.opts.append(('jobconf', 'mapred.job.queue.name=%s'
                           % addedopts['queue'][0]))
     if addedopts['cachefile']:
         for cachefile in addedopts['cachefile']:
             self.opts.append(('cacheFile', cachefile))
     if addedopts['cachearchive']:
         for cachearchive in addedopts['cachearchive']:
             self.opts.append(('cacheArchive', cachearchive))
     if addedopts['file']:
         for file in addedopts['file']:
             if not '://' in file:
                 if not os.path.exists(file):
                     raise ValueError('file "' + file + '" does not exist')
                 file = 'file://' + os.path.abspath(file)
             self.opts.append(('file', file))
     if not addedopts['inputformat']:
         addedopts['inputformat'] = ['auto']
     inputformat_shortcuts = \
         {'code': 'org.apache.hadoop.streaming.AutoInputFormat',
          'text': 'org.apache.hadoop.mapred.TextInputFormat',
          'sequencefile': 'org.apache.hadoop.streaming.AutoInputFormat',
          'auto': 'org.apache.hadoop.streaming.AutoInputFormat'}
     inputformat_shortcuts.update(configopts('inputformats', self.prog))
     inputformat = addedopts['inputformat'][0]
     if inputformat_shortcuts.has_key(inputformat.lower()):
         inputformat = inputformat_shortcuts[inputformat.lower()]
     self.opts.append(('inputformat', inputformat))
     if not addedopts['outputformat']:
         addedopts['outputformat'] = ['sequencefile']
      if addedopts['getpath'] and addedopts['getpath'][0] != 'no':
         outputformat_shortcuts = \
             {'code': 'fm.last.feathers.output.MultipleSequenceFiles',
              'text': 'fm.last.feathers.output.MultipleTextFiles',               
              'raw': 'fm.last.feathers.output.MultipleRawFileOutputFormat',
              'sequencefile': 'fm.last.feathers.output.MultipleSequenceFiles'}
     else:
         outputformat_shortcuts = \
             {'code': 'org.apache.hadoop.mapred.SequenceFileOutputFormat',
              'text': 'org.apache.hadoop.mapred.TextOutputFormat',
              'raw': 'fm.last.feathers.output.RawFileOutputFormat',
              'sequencefile': 'org.apache.hadoop.mapred.SequenceFileOutputFormat'}
     outputformat_shortcuts.update(configopts('outputformats', self.prog))
     outputformat = addedopts['outputformat'][0]
     if outputformat_shortcuts.has_key(outputformat.lower()):
         outputformat = outputformat_shortcuts[outputformat.lower()]
     self.opts.append(('outputformat', outputformat))
     if addedopts['addpath'] and addedopts['addpath'][0] != 'no':
         self.opts.append(('cmdenv', 'dumbo_addpath=true'))
     pyenv = envdef('PYTHONPATH',
                    addedopts['libegg'],
                    'file',
                    self.opts,
                    shortcuts=dict(configopts('eggs', self.prog)),
                    quote=False,
                    trim=True,
                    extrapaths=addedopts['pypath'])
     if pyenv:
         self.opts.append(('cmdenv', pyenv))
     hadenv = envdef('HADOOP_CLASSPATH', addedopts['libjar'], 'libjar', 
                     self.opts, shortcuts=dict(configopts('jars', self.prog)))
     fileopt = getopt(self.opts, 'file')
     if fileopt:
         tmpfiles = []
         for file in fileopt:
             if file.startswith('file://'):
                 self.opts.append(('file', file[7:]))
             else:
                 tmpfiles.append(file)
         if tmpfiles:
             self.opts.append(('jobconf', 'tmpfiles=' + ','.join(tmpfiles)))
     libjaropt = getopt(self.opts, 'libjar')
     if libjaropt:
         tmpjars = []
         for jar in libjaropt:
             if jar.startswith('file://'):
                 self.opts.append(('file', jar[7:]))
             else:
                 tmpjars.append(jar)
         if tmpjars:
             self.opts.append(('jobconf', 'tmpjars=' + ','.join(tmpjars)))
     cmd = hadoop + '/bin/hadoop jar ' + streamingjar
     retval = execute(cmd, self.opts, hadenv)
     if addedopts['delinputs'] and addedopts['delinputs'][0] == 'yes':
         for (key, value) in self.opts:
             if key == 'input':
                 if os.path.exists(hadoop + "/bin/hdfs"):
                     hdfs = hadoop + "/bin/hdfs"
                 else:
                     hdfs = hadoop + "/bin/hadoop"
                 execute("%s dfs -rmr '%s'" % (hdfs, value))
     return retval