def run(self):
    retval = Iteration.run(self)
    if retval != 0:
        return retval
    addedopts = getopts(self.opts, ['input', 'output', 'mapper', 'reducer',
                                    'libegg', 'delinputs', 'cmdenv', 'pv',
                                    'addpath', 'inputformat', 'outputformat',
                                    'numreducetasks', 'python', 'pypath',
                                    'sorttmpdir', 'sortbufsize'])
    (mapper, reducer) = (addedopts['mapper'][0], addedopts['reducer'][0])
    if not addedopts['input'] or not addedopts['output']:
        print >> sys.stderr, 'ERROR: input or output not specified'
        return 1
    # flatten space-separated -input values into a single list of paths
    inputs = reduce(operator.concat, (input.split(' ')
                    for input in addedopts['input']))
    output = addedopts['output'][0]
    pyenv = envdef('PYTHONPATH', addedopts['libegg'],
                   shortcuts=dict(configopts('eggs', self.prog)),
                   extrapaths=addedopts['pypath'])
    # split on the first '=' only, so that values containing '=' survive
    cmdenv = ' '.join("%s='%s'" % tuple(arg.split('=', 1))
                      for arg in addedopts['cmdenv'])
    if addedopts['pv'] and addedopts['pv'][0] == 'yes':
        # pipe through pv(1) to get a progress meter for each stage
        mpv = '| pv -s `du -b %s | cut -f 1` -cN map ' % ' '.join(inputs)
        (spv, rpv) = ('| pv -cN sort ', '| pv -cN reduce ')
    else:
        (mpv, spv, rpv) = ('', '', '')
    (sorttmpdir, sortbufsize) = ('', '')
    if addedopts['sorttmpdir']:
        sorttmpdir = '-T %s' % addedopts['sorttmpdir'][0]
    if addedopts['sortbufsize']:
        sortbufsize = '-S %s' % addedopts['sortbufsize'][0]
    python = addedopts['python'][0]
    encodepipe = pyenv + ' ' + python + \
                 ' -m dumbo.cmd encodepipe -file ' + ' -file '.join(inputs)
    if addedopts['inputformat'] and addedopts['inputformat'][0] == 'code':
        encodepipe += ' -alreadycoded yes'
    if addedopts['addpath'] and addedopts['addpath'][0] != 'no':
        encodepipe += ' -addpath yes'
    if addedopts['numreducetasks'] and addedopts['numreducetasks'][0] == '0':
        # map-only job: skip the sort and reduce stages
        retval = execute("%s | %s %s %s %s > '%s'" % (encodepipe, pyenv,
                         cmdenv, mapper, mpv, output))
    else:
        retval = execute("%s | %s %s %s %s| LC_ALL=C sort %s %s %s| %s %s %s %s> '%s'"
                         % (encodepipe, pyenv, cmdenv, mapper, mpv, sorttmpdir,
                            sortbufsize, spv, pyenv, cmdenv, reducer, rpv,
                            output))
    if addedopts['delinputs'] and addedopts['delinputs'][0] == 'yes':
        for file in addedopts['input']:
            execute('rm ' + file)
    return retval

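# For orientation, a hedged sketch of the shell pipeline the method above
# assembles when there is a reduce phase (paths, script names, and the K='V'
# cmdenv pair are hypothetical):
#
#   PYTHONPATH=... python -m dumbo.cmd encodepipe -file a.txt -file b.txt \
#       | PYTHONPATH=... K='V' <mapper> \
#       | LC_ALL=C sort -T /scratch -S 256M \
#       | PYTHONPATH=... K='V' <reducer> > 'out'
#
# LC_ALL=C makes sort(1) compare raw bytes, so keys are grouped exactly the
# way the reducer expects regardless of the user's locale.
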
def run(self):
    retval = Iteration.run(self)
    if retval != 0:
        return retval
    if os.path.exists(self.prog):
        self.opts.append(('file', self.prog))
    addedopts = getopts(self.opts, ['hadoop', 'name', 'delinputs', 'libegg',
                                    'libjar', 'libjarstreaming', 'inputformat',
                                    'outputformat', 'nummaptasks',
                                    'numreducetasks', 'priority', 'queue',
                                    'cachefile', 'cachearchive', 'file',
                                    'codewritable', 'addpath', 'getpath',
                                    'python', 'streamoutput', 'pypath'])
    hadoop = findhadoop(addedopts['hadoop'][0])
    streamingjar = getopt(self.opts, 'streamingjar')
    if streamingjar is None or len(streamingjar) == 0:
        streamingjar = findjar(hadoop, 'streaming')
    else:
        streamingjar = streamingjar[0]
    if not streamingjar:
        print >> sys.stderr, 'ERROR: Streaming jar not found'
        return 1
    # add typedbytes to path
    try:
        import typedbytes
    except ImportError:
        print >> sys.stderr, 'ERROR: "typedbytes" module not found'
        return 1
    modpath = re.sub(r'\.egg.*$', '.egg', typedbytes.__file__)
    if modpath.endswith('.egg'):
        addedopts['libegg'].append(modpath)
    else:
        self.opts.append(('file', modpath))
    # add ctypedbytes to job
    try:
        import ctypedbytes
        print >> sys.stderr, 'INFO: "ctypedbytes" found!'
        modpath = re.sub(r'\.egg.*$', '.egg', ctypedbytes.__file__)
        if modpath.endswith('.egg'):
            addedopts['libegg'].append(modpath)
    except ImportError:
        pass
    # exchange typedbytes-encoded records with the streaming jar
    self.opts.append(('jobconf', 'stream.map.input=typedbytes'))
    self.opts.append(('jobconf', 'stream.reduce.input=typedbytes'))
    if addedopts['numreducetasks'] and addedopts['numreducetasks'][0] == '0':
        self.opts.append(('jobconf', 'stream.reduce.output=typedbytes'))
        if addedopts['streamoutput']:
            id_ = addedopts['streamoutput'][0]
            self.opts.append(('jobconf', 'stream.map.output=' + id_))
        else:
            self.opts.append(('jobconf', 'stream.map.output=typedbytes'))
    else:
        self.opts.append(('jobconf', 'stream.map.output=typedbytes'))
        if addedopts['streamoutput']:
            id_ = addedopts['streamoutput'][0]
            self.opts.append(('jobconf', 'stream.reduce.output=' + id_))
        else:
            self.opts.append(('jobconf', 'stream.reduce.output=typedbytes'))
    if not addedopts['name']:
        self.opts.append(('jobconf', 'mapred.job.name='
                          + self.prog.split('/')[-1]))
    else:
        self.opts.append(('jobconf', 'mapred.job.name=%s'
                          % addedopts['name'][0]))
    if addedopts['nummaptasks']:
        self.opts.append(('jobconf', 'mapred.map.tasks=%s'
                          % addedopts['nummaptasks'][0]))
    if addedopts['numreducetasks']:
        numreducetasks = int(addedopts['numreducetasks'][0])
        self.opts.append(('numReduceTasks', str(numreducetasks)))
    if addedopts['priority']:
        self.opts.append(('jobconf', 'mapred.job.priority=%s'
                          % addedopts['priority'][0]))
    if addedopts['queue']:
        self.opts.append(('jobconf', 'mapred.job.queue.name=%s'
                          % addedopts['queue'][0]))
    if addedopts['cachefile']:
        for cachefile in addedopts['cachefile']:
            self.opts.append(('cacheFile', cachefile))
    if addedopts['cachearchive']:
        for cachearchive in addedopts['cachearchive']:
            self.opts.append(('cacheArchive', cachearchive))
    if addedopts['file']:
        for file in addedopts['file']:
            if not '://' in file:
                if not os.path.exists(file):
                    raise ValueError('file "' + file + '" does not exist')
                file = 'file://' + os.path.abspath(file)
            self.opts.append(('file', file))
    if not addedopts['inputformat']:
        addedopts['inputformat'] = ['auto']
    inputformat_shortcuts = \
        {'code': 'org.apache.hadoop.streaming.AutoInputFormat',
         'text': 'org.apache.hadoop.mapred.TextInputFormat',
         'sequencefile': 'org.apache.hadoop.streaming.AutoInputFormat',
         'auto': 'org.apache.hadoop.streaming.AutoInputFormat'}
    inputformat_shortcuts.update(configopts('inputformats', self.prog))
    inputformat = addedopts['inputformat'][0]
    if inputformat.lower() in inputformat_shortcuts:
        inputformat = inputformat_shortcuts[inputformat.lower()]
    self.opts.append(('inputformat', inputformat))
    if not addedopts['outputformat']:
        addedopts['outputformat'] = ['sequencefile']
    # note: compare the first value, not the list itself, against 'no'
    if addedopts['getpath'] and addedopts['getpath'][0] != 'no':
        outputformat_shortcuts = \
            {'code': 'fm.last.feathers.output.MultipleSequenceFiles',
             'text': 'fm.last.feathers.output.MultipleTextFiles',
             'raw': 'fm.last.feathers.output.MultipleRawFileOutputFormat',
             'sequencefile': 'fm.last.feathers.output.MultipleSequenceFiles'}
    else:
        outputformat_shortcuts = \
            {'code': 'org.apache.hadoop.mapred.SequenceFileOutputFormat',
             'text': 'org.apache.hadoop.mapred.TextOutputFormat',
             'raw': 'fm.last.feathers.output.RawFileOutputFormat',
             'sequencefile': 'org.apache.hadoop.mapred.SequenceFileOutputFormat'}
    outputformat_shortcuts.update(configopts('outputformats', self.prog))
    outputformat = addedopts['outputformat'][0]
    if outputformat.lower() in outputformat_shortcuts:
        outputformat = outputformat_shortcuts[outputformat.lower()]
    self.opts.append(('outputformat', outputformat))
    if addedopts['addpath'] and addedopts['addpath'][0] != 'no':
        self.opts.append(('cmdenv', 'dumbo_addpath=true'))
    pyenv = envdef('PYTHONPATH', addedopts['libegg'], 'file', self.opts,
                   shortcuts=dict(configopts('eggs', self.prog)),
                   quote=False, trim=True, extrapaths=addedopts['pypath'])
    if pyenv:
        self.opts.append(('cmdenv', pyenv))
    if addedopts['libjarstreaming'] and addedopts['libjarstreaming'][0] != 'no':
        addedopts['libjar'].append(streamingjar)
    hadenv = envdef('HADOOP_CLASSPATH', addedopts['libjar'], 'libjar',
                    self.opts, shortcuts=dict(configopts('jars', self.prog)))
    # local files go to the job via the generic 'tmpfiles'/'tmpjars' jobconfs
    fileopt = getopt(self.opts, 'file')
    if fileopt:
        tmpfiles = []
        for file in fileopt:
            if file.startswith('file://'):
                self.opts.append(('file', file[7:]))
            else:
                tmpfiles.append(file)
        if tmpfiles:
            self.opts.append(('jobconf', 'tmpfiles=' + ','.join(tmpfiles)))
    libjaropt = getopt(self.opts, 'libjar')
    if libjaropt:
        tmpjars = []
        for jar in libjaropt:
            if jar.startswith('file://'):
                self.opts.append(('file', jar[7:]))
            else:
                tmpjars.append(jar)
        if tmpjars:
            self.opts.append(('jobconf', 'tmpjars=' + ','.join(tmpjars)))
    cmd = hadoop + '/bin/hadoop jar ' + streamingjar
    retval = execute(cmd, self.opts, hadenv)
    if addedopts['delinputs'] and addedopts['delinputs'][0] == 'yes':
        for (key, value) in self.opts:
            if key == 'input':
                if os.path.exists(hadoop + '/bin/hdfs'):
                    hdfs = hadoop + '/bin/hdfs'
                else:
                    hdfs = hadoop + '/bin/hadoop'
                execute("%s dfs -rmr '%s'" % (hdfs, value))
    return retval

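# For reference, a hedged sketch of the command that execute() ends up
# running for this backend (all paths and option values are hypothetical):
#
#   HADOOP_CLASSPATH=... /usr/lib/hadoop/bin/hadoop jar \
#       /usr/lib/hadoop/contrib/streaming/hadoop-streaming.jar \
#       -input in -output out -file wordcount.py \
#       -inputformat org.apache.hadoop.streaming.AutoInputFormat \
#       -jobconf stream.map.input=typedbytes \
#       -jobconf mapred.job.name=wordcount.py \
#       -cmdenv PYTHONPATH=...
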
def __init__(self, prog, opts):
    Iteration.__init__(self, prog, opts)
    self.opts += configopts('streaming', prog, self.opts)
    hadoop = getopt(self.opts, 'hadoop', delete=False)[0]
    self.opts += configopts('streaming_' + hadoop, prog, self.opts)

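# The two configopts() calls above layer file-based defaults onto the
# command-line options: first a generic [streaming] section, then a
# [streaming_<hadoop>] section keyed on the -hadoop value. A hypothetical
# config snippet illustrating the idea (section names follow that scheme,
# the option values are made up):
#
#   [streaming]
#   numreducetasks: 16
#
#   [streaming_mycluster]
#   queue: production
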
def run(self):
    retval = Iteration.run(self)
    if retval != 0:
        return retval
    opts = self.opts
    if os.path.exists(self.prog):
        opts.add('file', self.prog)
    keys = ['hadoop', 'name', 'delinputs', 'libegg', 'libjar', 'inputformat',
            'outputformat', 'nummaptasks', 'numreducetasks', 'priority',
            'queue', 'cachefile', 'cachearchive', 'file', 'codewritable',
            'addpath', 'getpath', 'python', 'streamoutput', 'pypath',
            'hadooplib']
    addedopts = opts.filter(keys)
    opts.remove(*keys)
    hadoop = findhadoop(addedopts['hadoop'][0])
    streamingjar = findjar(hadoop, 'streaming', addedopts['hadooplib'])
    if not streamingjar:
        print >> sys.stderr, 'ERROR: Streaming jar not found'
        return 1
    try:
        import typedbytes
    except ImportError:
        print >> sys.stderr, 'ERROR: "typedbytes" module not found'
        return 1
    modpath = re.sub(r'\.egg.*$', '.egg', typedbytes.__file__)
    if modpath.endswith('.egg'):
        addedopts.add('libegg', modpath)
    else:
        opts.add('file', modpath)
    opts.add('jobconf', 'stream.map.input=typedbytes')
    opts.add('jobconf', 'stream.reduce.input=typedbytes')
    if addedopts['numreducetasks'] and addedopts['numreducetasks'][0] == '0':
        opts.add('jobconf', 'stream.reduce.output=typedbytes')
        if addedopts['streamoutput']:
            id_ = addedopts['streamoutput'][0]
            opts.add('jobconf', 'stream.map.output=' + id_)
        else:
            opts.add('jobconf', 'stream.map.output=typedbytes')
    else:
        opts.add('jobconf', 'stream.map.output=typedbytes')
        if addedopts['streamoutput']:
            id_ = addedopts['streamoutput'][0]
            opts.add('jobconf', 'stream.reduce.output=' + id_)
        else:
            opts.add('jobconf', 'stream.reduce.output=typedbytes')
    progname = self.prog.split('/')[-1] if not addedopts['name'] \
        else addedopts['name'][0]
    opts.add('jobconf', 'mapred.job.name=%s' % progname)
    nummaptasks = addedopts['nummaptasks']
    numreducetasks = addedopts['numreducetasks']
    if nummaptasks:
        opts.add('jobconf', 'mapred.map.tasks=%s' % nummaptasks[0])
    if numreducetasks:
        opts.add('numReduceTasks', numreducetasks[0])
    if addedopts['priority']:
        opts.add('jobconf', 'mapred.job.priority=%s' % addedopts['priority'][0])
    if addedopts['queue']:
        opts.add('jobconf', 'mapred.job.queue.name=%s' % addedopts['queue'][0])
    for cachefile in addedopts['cachefile']:
        opts.add('cacheFile', cachefile)
    for cachearchive in addedopts['cachearchive']:
        opts.add('cacheArchive', cachearchive)
    for _file in addedopts['file']:
        if not '://' in _file:
            if not os.path.exists(_file):
                raise ValueError('file "%s" does not exist' % _file)
            _file = 'file://%s' % os.path.abspath(_file)
        opts.add('file', _file)
    if not addedopts['inputformat']:
        addedopts.add('inputformat', 'auto')
    inputformat_shortcuts = {
        'code': 'org.apache.hadoop.streaming.AutoInputFormat',
        'text': 'org.apache.hadoop.mapred.TextInputFormat',
        'sequencefile': 'org.apache.hadoop.streaming.AutoInputFormat',
        'auto': 'org.apache.hadoop.streaming.AutoInputFormat'
    }
    inputformat_shortcuts.update(configopts('inputformats', self.prog))
    inputformat = addedopts['inputformat'][0]
    if inputformat.lower() in inputformat_shortcuts:
        inputformat = inputformat_shortcuts[inputformat.lower()]
    opts.add('inputformat', inputformat)
    if not addedopts['outputformat']:
        addedopts.add('outputformat', 'sequencefile')
    if addedopts['getpath'] and 'no' not in addedopts['getpath']:
        outputformat_shortcuts = {
            'code': 'fm.last.feathers.output.MultipleSequenceFiles',
            'text': 'fm.last.feathers.output.MultipleTextFiles',
            'raw': 'fm.last.feathers.output.MultipleRawFileOutputFormat',
            'sequencefile': 'fm.last.feathers.output.MultipleSequenceFiles'
        }
    else:
        outputformat_shortcuts = {
            'code': 'org.apache.hadoop.mapred.SequenceFileOutputFormat',
            'text': 'org.apache.hadoop.mapred.TextOutputFormat',
            'raw': 'fm.last.feathers.output.RawFileOutputFormat',
            'sequencefile': 'org.apache.hadoop.mapred.SequenceFileOutputFormat'
        }
    outputformat_shortcuts.update(configopts('outputformats', self.prog))
    outputformat = addedopts['outputformat'][0]
    if outputformat.lower() in outputformat_shortcuts:
        outputformat = outputformat_shortcuts[outputformat.lower()]
    opts.add('outputformat', outputformat)
    if addedopts['addpath'] and 'no' not in addedopts['addpath']:
        opts.add('cmdenv', 'dumbo_addpath=true')
    pyenv = envdef('PYTHONPATH', addedopts['libegg'], 'file', self.opts,
                   shortcuts=dict(configopts('eggs', self.prog)),
                   quote=False, trim=True, extrapaths=addedopts['pypath'])
    if pyenv:
        opts.add('cmdenv', pyenv)
    hadenv = envdef('HADOOP_CLASSPATH', addedopts['libjar'], 'libjar',
                    self.opts, shortcuts=dict(configopts('jars', self.prog)))
    tmpfiles = []
    for _file in opts.pop('file'):
        if _file.startswith('file://'):
            opts.add('file', _file[7:])
        else:
            tmpfiles.append(_file)
    if tmpfiles:
        opts.add('jobconf', 'tmpfiles=%s' % ','.join(tmpfiles))
    tmpjars = []
    for jar in opts.pop('libjar'):
        if jar.startswith('file://'):
            opts.add('file', jar[7:])
        else:
            tmpjars.append(jar)
    if tmpjars:
        opts.add('jobconf', 'tmpjars=%s' % ','.join(tmpjars))
    cmd = hadoop + '/bin/hadoop jar ' + streamingjar
    retval = execute(cmd, opts, hadenv)
    if 'yes' in addedopts['delinputs']:
        inputs = opts['input']
        for path in inputs:
            execute("%s/bin/hadoop fs -rmr '%s'" % (hadoop, path))
    return retval

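# The Options container used above is dumbo's own class; the sketch below is
# an assumption-laden stand-in (not the real implementation) illustrating the
# add/filter/remove/pop/__getitem__ semantics the code relies on:

class OptionsSketch(object):
    """Hypothetical model of the Options API used by run() above."""

    def __init__(self, pairs=None):
        self._pairs = list(pairs or [])

    def add(self, key, value):
        # options behave like a multimap: repeated keys accumulate
        self._pairs.append((key, value))

    def __getitem__(self, key):
        return [v for k, v in self._pairs if k == key]

    def filter(self, keys):
        # keep only the given keys, returned as a new container
        keys = set(keys)
        return OptionsSketch((k, v) for k, v in self._pairs if k in keys)

    def remove(self, *keys):
        keys = set(keys)
        self._pairs = [(k, v) for k, v in self._pairs if k not in keys]

    def pop(self, key):
        # return all values for a key and drop them from the container
        values = self[key]
        self.remove(key)
        return values
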
def __init__(self, prog, opts):
    Iteration.__init__(self, prog, opts)
    self.opts += Options(configopts('streaming', prog, self.opts))
    hadoop_streaming = 'streaming_%s' % self.opts['hadoop'][0]
    self.opts += Options(configopts(hadoop_streaming, prog, self.opts))

def __init__(self, prog, opts):
    Iteration.__init__(self, prog, opts)
    self.opts += configopts('unix', prog, self.opts)

def run(self):
    retval = Iteration.run(self)
    if retval != 0:
        return retval
    if os.path.exists(self.prog):
        self.opts.append(('file', self.prog))
    addedopts = getopts(self.opts, ['hadoop', 'name', 'delinputs', 'libegg',
                                    'libjar', 'inputformat', 'outputformat',
                                    'nummaptasks', 'numreducetasks',
                                    'priority', 'queue', 'cachefile',
                                    'cachearchive', 'file', 'codewritable',
                                    'addpath', 'getpath', 'python',
                                    'streamoutput', 'pypath'])
    hadoop = findhadoop(addedopts['hadoop'][0])
    streamingjar = findjar(hadoop, 'streaming')
    if not streamingjar:
        print >> sys.stderr, 'ERROR: Streaming jar not found'
        return 1
    try:
        import typedbytes
    except ImportError:
        print >> sys.stderr, 'ERROR: "typedbytes" module not found'
        return 1
    modpath = re.sub(r'\.egg.*$', '.egg', typedbytes.__file__)
    if modpath.endswith('.egg'):
        addedopts['libegg'].append(modpath)
    else:
        self.opts.append(('file', modpath))
    # exchange typedbytes-encoded records with the streaming jar
    self.opts.append(('jobconf', 'stream.map.input=typedbytes'))
    self.opts.append(('jobconf', 'stream.reduce.input=typedbytes'))
    if addedopts['numreducetasks'] and addedopts['numreducetasks'][0] == '0':
        self.opts.append(('jobconf', 'stream.reduce.output=typedbytes'))
        if addedopts['streamoutput']:
            id_ = addedopts['streamoutput'][0]
            self.opts.append(('jobconf', 'stream.map.output=' + id_))
        else:
            self.opts.append(('jobconf', 'stream.map.output=typedbytes'))
    else:
        self.opts.append(('jobconf', 'stream.map.output=typedbytes'))
        if addedopts['streamoutput']:
            id_ = addedopts['streamoutput'][0]
            self.opts.append(('jobconf', 'stream.reduce.output=' + id_))
        else:
            self.opts.append(('jobconf', 'stream.reduce.output=typedbytes'))
    if not addedopts['name']:
        self.opts.append(('jobconf', 'mapred.job.name='
                          + self.prog.split('/')[-1]))
    else:
        self.opts.append(('jobconf', 'mapred.job.name=%s'
                          % addedopts['name'][0]))
    if addedopts['nummaptasks']:
        self.opts.append(('jobconf', 'mapred.map.tasks=%s'
                          % addedopts['nummaptasks'][0]))
    if addedopts['numreducetasks']:
        numreducetasks = int(addedopts['numreducetasks'][0])
        self.opts.append(('numReduceTasks', str(numreducetasks)))
    if addedopts['priority']:
        self.opts.append(('jobconf', 'mapred.job.priority=%s'
                          % addedopts['priority'][0]))
    if addedopts['queue']:
        self.opts.append(('jobconf', 'mapred.job.queue.name=%s'
                          % addedopts['queue'][0]))
    if addedopts['cachefile']:
        for cachefile in addedopts['cachefile']:
            self.opts.append(('cacheFile', cachefile))
    if addedopts['cachearchive']:
        for cachearchive in addedopts['cachearchive']:
            self.opts.append(('cacheArchive', cachearchive))
    if addedopts['file']:
        for file in addedopts['file']:
            if not '://' in file:
                if not os.path.exists(file):
                    raise ValueError('file "' + file + '" does not exist')
                file = 'file://' + os.path.abspath(file)
            self.opts.append(('file', file))
    if not addedopts['inputformat']:
        addedopts['inputformat'] = ['auto']
    inputformat_shortcuts = \
        {'code': 'org.apache.hadoop.streaming.AutoInputFormat',
         'text': 'org.apache.hadoop.mapred.TextInputFormat',
         'sequencefile': 'org.apache.hadoop.streaming.AutoInputFormat',
         'auto': 'org.apache.hadoop.streaming.AutoInputFormat'}
    inputformat_shortcuts.update(configopts('inputformats', self.prog))
    inputformat = addedopts['inputformat'][0]
    if inputformat.lower() in inputformat_shortcuts:
        inputformat = inputformat_shortcuts[inputformat.lower()]
    self.opts.append(('inputformat', inputformat))
    if not addedopts['outputformat']:
        addedopts['outputformat'] = ['sequencefile']
    # note: compare the first value, not the list itself, against 'no'
    if addedopts['getpath'] and addedopts['getpath'][0] != 'no':
        outputformat_shortcuts = \
            {'code': 'fm.last.feathers.output.MultipleSequenceFiles',
             'text': 'fm.last.feathers.output.MultipleTextFiles',
             'raw': 'fm.last.feathers.output.MultipleRawFileOutputFormat',
             'sequencefile': 'fm.last.feathers.output.MultipleSequenceFiles'}
    else:
        outputformat_shortcuts = \
            {'code': 'org.apache.hadoop.mapred.SequenceFileOutputFormat',
             'text': 'org.apache.hadoop.mapred.TextOutputFormat',
             'raw': 'fm.last.feathers.output.RawFileOutputFormat',
             'sequencefile': 'org.apache.hadoop.mapred.SequenceFileOutputFormat'}
    outputformat_shortcuts.update(configopts('outputformats', self.prog))
    outputformat = addedopts['outputformat'][0]
    if outputformat.lower() in outputformat_shortcuts:
        outputformat = outputformat_shortcuts[outputformat.lower()]
    self.opts.append(('outputformat', outputformat))
    if addedopts['addpath'] and addedopts['addpath'][0] != 'no':
        self.opts.append(('cmdenv', 'dumbo_addpath=true'))
    pyenv = envdef('PYTHONPATH', addedopts['libegg'], 'file', self.opts,
                   shortcuts=dict(configopts('eggs', self.prog)),
                   quote=False, trim=True, extrapaths=addedopts['pypath'])
    if pyenv:
        self.opts.append(('cmdenv', pyenv))
    hadenv = envdef('HADOOP_CLASSPATH', addedopts['libjar'], 'libjar',
                    self.opts, shortcuts=dict(configopts('jars', self.prog)))
    # local files go to the job via the generic 'tmpfiles'/'tmpjars' jobconfs
    fileopt = getopt(self.opts, 'file')
    if fileopt:
        tmpfiles = []
        for file in fileopt:
            if file.startswith('file://'):
                self.opts.append(('file', file[7:]))
            else:
                tmpfiles.append(file)
        if tmpfiles:
            self.opts.append(('jobconf', 'tmpfiles=' + ','.join(tmpfiles)))
    libjaropt = getopt(self.opts, 'libjar')
    if libjaropt:
        tmpjars = []
        for jar in libjaropt:
            if jar.startswith('file://'):
                self.opts.append(('file', jar[7:]))
            else:
                tmpjars.append(jar)
        if tmpjars:
            self.opts.append(('jobconf', 'tmpjars=' + ','.join(tmpjars)))
    cmd = hadoop + '/bin/hadoop jar ' + streamingjar
    retval = execute(cmd, self.opts, hadenv)
    if addedopts['delinputs'] and addedopts['delinputs'][0] == 'yes':
        for (key, value) in self.opts:
            if key == 'input':
                if os.path.exists(hadoop + '/bin/hdfs'):
                    hdfs = hadoop + '/bin/hdfs'
                else:
                    hdfs = hadoop + '/bin/hadoop'
                execute("%s dfs -rmr '%s'" % (hdfs, value))
    return retval

def run(self):
    retval = Iteration.run(self)
    if retval != 0:
        return retval
    opts = self.opts
    keys = ["input", "output", "mapper", "reducer", "libegg", "delinputs",
            "cmdenv", "pv", "addpath", "inputformat", "outputformat",
            "numreducetasks", "python", "pypath", "sorttmpdir", "sortbufsize"]
    addedopts = opts.filter(keys)
    opts.remove(*keys)
    mapper, reducer = addedopts["mapper"][0], addedopts["reducer"][0]
    if not addedopts["input"] or not addedopts["output"]:
        print >> sys.stderr, "ERROR: input or output not specified"
        return 1
    _inputs = addedopts["input"]
    _output = addedopts["output"]
    # flatten space-separated -input values into a single list of paths
    inputs = reduce(operator.concat, (inp.split(" ") for inp in _inputs))
    output = _output[0]
    pyenv = envdef("PYTHONPATH", addedopts["libegg"],
                   shortcuts=dict(configopts("eggs", self.prog)),
                   extrapaths=addedopts["pypath"])
    # split on the first '=' only, so that values containing '=' survive
    cmdenv = " ".join("%s='%s'" % tuple(arg.split("=", 1))
                      for arg in addedopts["cmdenv"])
    if "yes" in addedopts["pv"]:
        # pipe through pv(1) to get a progress meter for each stage
        mpv = "| pv -s `du -b %s | cut -f 1` -cN map " % " ".join(inputs)
        (spv, rpv) = ("| pv -cN sort ", "| pv -cN reduce ")
    else:
        (mpv, spv, rpv) = ("", "", "")
    sorttmpdir, sortbufsize = "", ""
    if addedopts["sorttmpdir"]:
        sorttmpdir = "-T %s" % addedopts["sorttmpdir"][0]
    if addedopts["sortbufsize"]:
        sortbufsize = "-S %s" % addedopts["sortbufsize"][0]
    python = addedopts["python"][0]
    encodepipe = (pyenv + " " + python + " -m dumbo.cmd encodepipe -file "
                  + " -file ".join(inputs))
    if "code" in addedopts["inputformat"]:
        encodepipe += " -alreadycoded yes"
    if addedopts["addpath"] and "no" not in addedopts["addpath"]:
        encodepipe += " -addpath yes"
    if addedopts["numreducetasks"] and addedopts["numreducetasks"][0] == "0":
        # map-only job: skip the sort and reduce stages
        retval = execute("%s | %s %s %s %s > '%s'"
                         % (encodepipe, pyenv, cmdenv, mapper, mpv, output))
    else:
        retval = execute("%s | %s %s %s %s| LC_ALL=C sort %s %s %s| %s %s %s %s> '%s'"
                         % (encodepipe, pyenv, cmdenv, mapper, mpv,
                            sorttmpdir, sortbufsize, spv, pyenv, cmdenv,
                            reducer, rpv, output))
    if "yes" in addedopts["delinputs"]:
        for _file in addedopts["input"]:
            execute("rm " + _file)
    return retval

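# The reduce/operator.concat idiom above flattens space-separated -input
# values into one list of paths; a quick standalone illustration:
#
#   >>> import operator
#   >>> reduce(operator.concat, (s.split(" ") for s in ["a b", "c"]))
#   ['a', 'b', 'c']
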
def __init__(self, prog, opts): Iteration.__init__(self, prog, opts) self.opts += Options(configopts("unix", prog, self.opts))
def run(self):
    retval = Iteration.run(self)
    if retval != 0:
        return retval
    addedopts = getopts(self.opts, ['input', 'output', 'mapper', 'reducer',
                                    'libegg', 'delinputs', 'cmdenv',
                                    'inputformat', 'outputformat',
                                    'numreducetasks', 'python', 'pypath',
                                    'tmpdir', 'nmappers', 'nreducers',
                                    'permapper', 'shell'])
    (mapper, reducer) = (addedopts['mapper'][0], addedopts['reducer'][0])
    if not addedopts['input'] or not addedopts['output']:
        print >> sys.stderr, 'ERROR: input or output not specified'
        return 1
    inputs = reduce(operator.concat, (input.split(' ')
                    for input in addedopts['input']))
    output = addedopts['output'][0]
    try:
        os.makedirs(output)
    except os.error:
        pass
    pyenv = envdef('PYTHONPATH', addedopts['libegg'],
                   shortcuts=dict(configopts('eggs', self.prog)),
                   extrapaths=addedopts['pypath'])
    cmdenv = ' '.join("%s='%s'" % tuple(arg.split('=', 1))
                      for arg in addedopts['cmdenv'])
    shell = addedopts['shell'][0]
    python = addedopts['python'][0]
    mapTotal = len(inputs)
    # single-element lists so the nested closures below can mutate the
    # counters (Python 2 has no 'nonlocal')
    mapDoneCount = [0]
    reduceDoneCount = [0]
    nMappers = int(addedopts['nmappers'][0])
    nReducers = int(addedopts['nreducers'][0])
    # this is the number of files that will be handed to each mapper
    permapper = int(addedopts['permapper'][0])
    # start the mappers, reducers
    mPool = Pool(nMappers)
    rPool = Pool(nReducers)
    doReduces = not (addedopts['numreducetasks']
                     and addedopts['numreducetasks'][0] == '0')
    # set up the mapper output/reducer input directories
    tmpdir = os.sep.join([addedopts['tmpdir'][0], '%s_%06d'
                          % (time.strftime('%Y-%m-%d_%H-%M-%S', time.gmtime()),
                             random.randint(0, 999999))])
    mLock = threading.Lock()
    mResults = {}
    rLock = threading.Lock()
    mByR = {}
    rStarted = set()
    rResults = {}
    # start the map status output copier
    mapErrOutputCopier = MapErrOutputCopier(tmpdir)
    mapErrOutputCopier.start()
    # initialized unconditionally so that copyDone() and the status line in
    # the polling loop also work for map-only jobs
    copyLock = threading.Lock()
    copyThreads = {}
    if doReduces:
        # start the copy threads to handle map outputs
        for i in range(nReducers):
            copyThreads[i] = ReduceInputCopier(tmpdir, i)
            copyThreads[i].start()
        # start the reduce status output copier
        reduceErrOutputCopier = ReduceErrOutputCopier(tmpdir)
        reduceErrOutputCopier.start()
        for i in range(nReducers):
            try:
                os.makedirs(os.sep.join([tmpdir, 'r-%d' % i]))
            except os.error:
                pass
            mByR[i] = set()
    # do maps -- kick it all off
    if permapper == 1:
        for (i, filename) in enumerate(inputs):
            args = (pyenv, python, cmdenv, mapper, nReducers, tmpdir, output,
                    addedopts, shell, i, [filename], doReduces)
            mLock.acquire()
            mResults[i] = mPool.apply_async(doMap, args)
            mLock.release()
    else:
        # multiple files per mapper...
        remaining = list(inputs)
        i = 0
        while remaining:
            args = (pyenv, python, cmdenv, mapper, nReducers, tmpdir, output,
                    addedopts, shell, i, remaining[:permapper], doReduces)
            mLock.acquire()
            mResults[i] = mPool.apply_async(doMap, args)
            mLock.release()
            remaining = remaining[permapper:]
            i += 1
        # need to reset the mapTotal variable since we have fewer tasks...
        mapTotal = i

    def reduceDone():
        # did anything finish?
        rLock.acquire()
        done = [x for x in rResults if rResults[x].ready()]
        for reducenum in done:
            del rResults[reducenum]  # cleanup
        rLock.release()
        for reducenum in done:
            #print "reduce %d done" % reducenum
            reduceDoneCount[0] += 1
            reduceErrOutputCopier.reduce_done(reducenum)

    def mapDone():
        # did anything finish?
        mLock.acquire()
        done = [x for x in mResults if mResults[x].ready()]
        for i in done:
            del mResults[i]  # cleanup
        mLock.release()
        for i in done:
            mapDoneCount[0] += 1
            mapErrOutputCopier.map_done(i)
            if doReduces:
                #print "map %d done" % i
                # update the structures
                for reducenum in range(nReducers):
                    # initiate the copy request...
                    copyThreads[reducenum].map_done(i)
                    rLock.acquire()
                    mByR[reducenum].add(i)
                    # see if we can signal that's all the copier will have to handle
                    if len(mByR[reducenum]) == mapTotal:
                        copyThreads[reducenum].map_done(None)
                    rLock.release()
            else:
                # just move the map output file (unsorted) to the output directory
                print "map %d done" % i

    def copyDone():
        # did anything finish?
        copyLock.acquire()
        done = [x for x in copyThreads if not copyThreads[x].is_alive()]
        for rnum in done:
            del copyThreads[rnum]  # cleanup
        copyLock.release()
        for rnum in done:
            # all map output for this reducer has been copied; start reducing
            rLock.acquire()
            rStarted.add(rnum)
            args = (tmpdir, pyenv, cmdenv, reducer, output, shell, rnum)
            rResults[rnum] = rPool.apply_async(doReduce, args)
            rLock.release()

    # poll until all reduces are done (or all maps, for a map-only job)
    while (reduceDoneCount[0] < nReducers) if doReduces \
            else (mapDoneCount[0] < mapTotal):
        # check for things finishing...
        mapDone()
        copyDone()
        reduceDone()
        mLock.acquire()
        haveMaps = len(mResults)
        mLock.release()
        rLock.acquire()
        haveReduces = len(rResults)
        rLock.release()
        copyLock.acquire()
        copyRunning = len(copyThreads)
        copyLock.release()
        print "%d/%d/%d maps\t%d/%d copies\t%d/%d/%d reduces" \
            % (haveMaps, mapDoneCount[0], mapTotal, copyRunning, nReducers,
               haveReduces, reduceDoneCount[0], nReducers)
        time.sleep(5)
    mPool.terminate()
    mPool.join()
    rPool.terminate()
    rPool.join()
    # make sure the map status output is done before cleaning up the tmp dir
    mapErrOutputCopier.map_done(None)
    mapErrOutputCopier.join()
    if doReduces:
        # make sure the reduce status output is done before cleaning up the tmp dir
        reduceErrOutputCopier.reduce_done(None)
        reduceErrOutputCopier.join()
    if not master_debug and len(os.listdir(tmpdir)) == 0:
        os.rmdir(tmpdir)
    return 0  # TODO: surface a nonzero exit code when a map or reduce task fails

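# The scheduler above polls AsyncResult.ready() rather than blocking on
# get(); a minimal self-contained illustration of that pattern (the task
# function and counts below are made up):

def _square(x):
    # trivial stand-in for doMap/doReduce
    return x * x

def _poll_demo():
    from multiprocessing import Pool
    import time
    pool = Pool(2)
    pending = dict((i, pool.apply_async(_square, (i,))) for i in range(4))
    while pending:
        # collect finished tasks without blocking on any single one
        done = [i for i, r in pending.items() if r.ready()]
        for i in done:
            print 'task %d -> %d' % (i, pending.pop(i).get())
        time.sleep(0.1)
    pool.close()
    pool.join()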
def __init__(self, prog, opts): Iteration.__init__(self, prog, opts) self.opts += configopts("streaming", prog, self.opts) hadoop = getopt(self.opts, "hadoop", delete=False)[0] self.opts += configopts("streaming_" + hadoop, prog, self.opts)