Exemple #1
0
 def create_filesystem(self, opts):
     """Build a StreamingFileSystem for the Hadoop install named in opts.

     The first 'hadoop' option value is resolved with findhadoop() and is
     also used as the suffix of the 'streaming_<name>' config section. The
     first 'streamingjar' value found in the 'streaming' and
     'streaming_<name>' config sections is passed along; when none is
     configured the empty getopt() result is passed through unchanged.
     """
     hadoopvalues = getopt(opts, 'hadoop', delete=False)
     hadoopname = hadoopvalues[0]
     hadoopdir = findhadoop(hadoopname)
     confopts = configopts('streaming')
     confopts += configopts('streaming_' + hadoopname)
     jars = getopt(confopts, 'streamingjar')
     return StreamingFileSystem(hadoopdir, jars[0] if jars else jars)
Exemple #2
0
 def create_filesystem(self, opts):
     """Return a UnixFileSystem, using the 'shell' option when one is given."""
     shellopt = getopt(opts, "shell", delete=False)
     # Only forward a shell when the caller supplied one; otherwise construct
     # the file system without arguments.
     return UnixFileSystem(shellopt[0]) if shellopt else UnixFileSystem()
Exemple #3
0
 def cat(self, path, opts):
     addedopts = getopts(opts, ['libjar'], delete=False)
     streamingjar = findjar(self.hadoop, 'streaming')
     if not streamingjar:
         print >> sys.stderr, 'ERROR: Streaming jar not found'
         return 1
     hadenv = envdef('HADOOP_CLASSPATH', addedopts['libjar'],
                     shortcuts=dict(configopts('jars')))
     try:
         import typedbytes
         ls = os.popen('%s %s dfs -ls %s' % (hadenv, self.hdfs, path))
         if sum(c in path for c in ("*", "?", "{")) > 0:
             # cat each file separately when the path contains special chars
             lineparts = (line.split()[-1] for line in ls)
             subpaths = [part for part in lineparts if part.startswith("/")]
         else:
             # we still do the ls even in this case to make sure we print errors 
             subpaths = [path]
         ls.close()
         for subpath in subpaths:
             if subpath.endswith("/_logs"):
                 continue
             dumptb = os.popen('%s %s/bin/hadoop jar %s dumptb %s 2> /dev/null'
                               % (hadenv, self.hadoop, streamingjar, subpath))
             ascodeopt = getopt(opts, 'ascode')
             if ascodeopt and ascodeopt[0] == 'yes':
                 outputs = dumpcode(typedbytes.PairedInput(dumptb))
             else:
                 outputs = dumptext(typedbytes.PairedInput(dumptb))
             for output in outputs:
                 print '\t'.join(output)
             dumptb.close()
     except IOError:
         pass  # ignore
     return 0
Exemple #4
0
 def cat(self, path, opts):
     addedopts = getopts(opts, ['libjar'], delete=False)
     streamingjar = findjar(self.hadoop, 'streaming')
     if not streamingjar:
         print >> sys.stderr, 'ERROR: Streaming jar not found'
         return 1
     hadenv = envdef('HADOOP_CLASSPATH', addedopts['libjar'],
                     shortcuts=dict(configopts('jars')))
     try:
         import typedbytes
         ls = os.popen('%s %s/bin/hadoop dfs -ls %s' % (hadenv, self.hadoop, path))
         if sum(c in path for c in ("*", "?", "{")) > 0:
             # cat each file separately when the path contains special chars
             lineparts = (line.split()[-1] for line in ls)
             subpaths = [part for part in lineparts if part.startswith("/")]
         else:
             # we still do the ls even in this case to make sure we print errors 
             subpaths = [path]
         ls.close()
         for subpath in subpaths:
             dumptb = os.popen('%s %s/bin/hadoop jar %s dumptb %s 2> /dev/null'
                               % (hadenv, self.hadoop, streamingjar, subpath))
             ascodeopt = getopt(opts, 'ascode')
             if ascodeopt and ascodeopt[0] == 'yes':
                 outputs = dumpcode(typedbytes.PairedInput(dumptb))
             else:
                 outputs = dumptext(typedbytes.PairedInput(dumptb))
             for output in outputs:
                 print '\t'.join(output)
             dumptb.close()
     except IOError:
         pass  # ignore
     return 0
Exemple #5
0
    def test_getopt(self):
        """Check getopt() lookups and its delete-by-default behaviour."""
        # Test for backward compatibility
        # A missing key yields an empty list and leaves opts untouched.
        opts = []
        values = getopt(opts, 'input')
        self.assertEqual(values, [])
        self.assertEqual(opts, [])

        # Every value of a repeated key is returned (order is not asserted)
        # and the matching entries are removed from opts.
        opts = [('param', 'p1'), ('param', 'p2'), ('input', '/dev/path')]
        values = getopt(opts, 'param')
        expected = ['p2', 'p1']
        self.assertEqual(set(values), set(expected))
        self.assertEqual(set(opts), set([('input', '/dev/path')]))

        # delete=False returns the value but keeps the entry in opts.
        opts = [('output', '/prod/path')]
        values = getopt(opts, 'output', delete=False)
        self.assertEqual(values, ['/prod/path'])
        self.assertEqual(opts, [('output', '/prod/path')])

        # The default call on the same list now removes the entry.
        values = getopt(opts, 'output')
        self.assertEqual(values, ['/prod/path'])
        self.assertEqual(opts, [])
Exemple #6
0
    def launch(self, mapper, reducer=None, combiner=None, opts=None, *args, **kwargs):
        """Assemble the options for one iteration and run it.

        Copied from dumbo.core.run. String stages are recorded as
        'mapper'/'reducer'/'combiner' options; a mapper or reducer object
        with an 'opts' attribute contributes those options instead. The
        flow's inputs, outputs and options are added, the connect options
        are applied through override_opts(), and the iteration is run as a
        StreamingIteration when a 'hadoop' option is present, otherwise as a
        UnixIteration. A non-zero status terminates the process with that
        status.
        """
        opts = opts or []

        # The mapper and reducer are handled the same way, in that order.
        for optname, stage in (('mapper', mapper), ('reducer', reducer)):
            if type(stage) == str:
                opts.append((optname, stage))
            elif hasattr(stage, 'opts'):
                opts += stage.opts
        if type(combiner) == str:
            opts.append(('combiner', combiner))

        inputs_param = 'FLOW_INPUTS=%s' % ';'.join(self.flow.inputs)
        outputs_param = 'FLOW_OUTPUTS=%s' % ';'.join(self.flow.outputs)
        opts += [('param', inputs_param), ('param', outputs_param)]

        opts += self.flow.opts

        opts = override_opts(opts, self.get_connect_opts())

        # Without a reducer the job is map-only.
        if not reducer:
            opts.append(('numreducetasks','0'))

        prog = getopt(opts, 'prog')
        runs_on_hadoop = getopt(opts, 'hadoop', delete=False)
        iteration_cls = StreamingIteration if runs_on_hadoop else UnixIteration
        retval = iteration_cls(prog[0], opts).run()
        if retval == 127:
            print >> sys.stderr, 'ERROR: Are you sure that "python" is on your path?'
        if retval != 0:
            sys.exit(retval)
Exemple #7
0
 def matches(self, opts):
     """Tell whether opts contain a 'hadoop' option (opts are not modified)."""
     hadoopopt = getopt(opts, "hadoop", delete=False)
     return True if hadoopopt else False
Exemple #8
0
 def run(self):
     """Translate the dumbo options into a Hadoop streaming job and run it.

     The base Iteration is run first; a non-zero status from it is returned
     immediately. Otherwise the dumbo-level options are read with getopts()
     and re-expressed as the jobconf / cmdenv / file / format options put
     on the "hadoop jar <streamingjar>" command line, which is then run
     through execute().

     Returns 1 when the streaming jar or the "typedbytes" module cannot be
     found, otherwise the status returned by execute() for the job.
     Raises ValueError when a local 'file' option names a missing path.
     """
     retval = Iteration.run(self)
     if retval != 0:
         return retval
     if os.path.exists(self.prog):
         self.opts.append(('file', self.prog))
     addedopts = getopts(self.opts, ['hadoop',
                                     'name',
                                     'delinputs',
                                     'libegg',
                                     'libjar',
                                     'libjarstreaming',
                                     'inputformat',
                                     'outputformat',
                                     'nummaptasks',
                                     'numreducetasks',
                                     'priority',
                                     'queue',
                                     'cachefile',
                                     'cachearchive',
                                     'file',
                                     'codewritable',
                                     'addpath',
                                     'getpath',
                                     'python',
                                     'streamoutput',
                                     'pypath'])
     hadoop = findhadoop(addedopts['hadoop'][0])
     # An explicit 'streamingjar' option wins over searching the install.
     streamingjar = getopt(self.opts, 'streamingjar')
     if streamingjar:
         streamingjar = streamingjar[0]
     else:
         streamingjar = findjar(hadoop, 'streaming')
     if not streamingjar:
         print >> sys.stderr, 'ERROR: Streaming jar not found'
         return 1

     # add typedbytes to path
     try:
         import typedbytes
     except ImportError:
         print >> sys.stderr, 'ERROR: "typedbytes" module not found'
         return 1
     # A module loaded from inside an egg is shipped as the whole egg,
     # otherwise the module file itself is shipped.
     modpath = re.sub(r'\.egg.*$', '.egg', typedbytes.__file__)
     if modpath.endswith('.egg'):
         addedopts['libegg'].append(modpath)
     else:
         self.opts.append(('file', modpath))

     # add ctypedbytes to job (optional; only shipped when it is an egg)
     try:
         import ctypedbytes
         print >>sys.stderr, 'INFO: "ctypedbytes" found!'
         modpath = re.sub(r'\.egg.*$', '.egg', ctypedbytes.__file__)
         if modpath.endswith('.egg'):
             addedopts['libegg'].append(modpath)
     except ImportError:
         pass

     self.opts.append(('jobconf', 'stream.map.input=typedbytes'))
     self.opts.append(('jobconf', 'stream.reduce.input=typedbytes'))
     # 'streamoutput' overrides the format of the final stage only: the map
     # output for a map-only job, the reduce output otherwise.
     if addedopts['numreducetasks'] and addedopts['numreducetasks'][0] == '0':
         self.opts.append(('jobconf', 'stream.reduce.output=typedbytes'))
         if addedopts['streamoutput']:
             id_ = addedopts['streamoutput'][0]
             self.opts.append(('jobconf', 'stream.map.output=' + id_))
         else:
             self.opts.append(('jobconf', 'stream.map.output=typedbytes'))
     else:
         self.opts.append(('jobconf', 'stream.map.output=typedbytes'))
         if addedopts['streamoutput']:
             id_ = addedopts['streamoutput'][0]
             self.opts.append(('jobconf', 'stream.reduce.output=' + id_))
         else:
             self.opts.append(('jobconf', 'stream.reduce.output=typedbytes'))
     # The job name defaults to the program's file name.
     if not addedopts['name']:
         self.opts.append(('jobconf', 'mapred.job.name='
                           + self.prog.split('/')[-1]))
     else:
         self.opts.append(('jobconf', 'mapred.job.name=%s'
                           % addedopts['name'][0]))
     if addedopts['nummaptasks']:
         self.opts.append(('jobconf', 'mapred.map.tasks=%s'
                           % addedopts['nummaptasks'][0]))
     if addedopts['numreducetasks']:
         # int() validates the value before it reaches the command line
         numreducetasks = int(addedopts['numreducetasks'][0])
         self.opts.append(('numReduceTasks', str(numreducetasks)))
     if addedopts['priority']:
         self.opts.append(('jobconf', 'mapred.job.priority=%s'
                           % addedopts['priority'][0]))
     if addedopts['queue']:
         self.opts.append(('jobconf', 'mapred.job.queue.name=%s'
                           % addedopts['queue'][0]))
     if addedopts['cachefile']:
         for cachefile in addedopts['cachefile']:
             self.opts.append(('cacheFile', cachefile))
     if addedopts['cachearchive']:
         for cachearchive in addedopts['cachearchive']:
             self.opts.append(('cacheArchive', cachearchive))
     if addedopts['file']:
         for fname in addedopts['file']:
             # Entries without a scheme are local files: they must exist
             # and are tagged with file:// so they can be told apart below.
             if '://' not in fname:
                 if not os.path.exists(fname):
                     raise ValueError('file "' + fname + '" does not exist')
                 fname = 'file://' + os.path.abspath(fname)
             self.opts.append(('file', fname))
     if not addedopts['inputformat']:
         addedopts['inputformat'] = ['auto']
     inputformat_shortcuts = \
         {'code': 'org.apache.hadoop.streaming.AutoInputFormat',
          'text': 'org.apache.hadoop.mapred.TextInputFormat',
          'sequencefile': 'org.apache.hadoop.streaming.AutoInputFormat',
          'auto': 'org.apache.hadoop.streaming.AutoInputFormat'}
     inputformat_shortcuts.update(configopts('inputformats', self.prog))
     inputformat = addedopts['inputformat'][0]
     # Shortcut names are case-insensitive; anything else is passed through.
     if inputformat.lower() in inputformat_shortcuts:
         inputformat = inputformat_shortcuts[inputformat.lower()]
     self.opts.append(('inputformat', inputformat))
     if not addedopts['outputformat']:
         addedopts['outputformat'] = ['sequencefile']
     # Compare the option's value, not the list holding it: a list never
     # equals 'no', so the old test enabled the Multiple* formats even for
     # "-getpath no".
     if addedopts['getpath'] and addedopts['getpath'][0] != 'no':
         outputformat_shortcuts = \
             {'code': 'fm.last.feathers.output.MultipleSequenceFiles',
              'text': 'fm.last.feathers.output.MultipleTextFiles',
              'raw': 'fm.last.feathers.output.MultipleRawFileOutputFormat',
              'sequencefile': 'fm.last.feathers.output.MultipleSequenceFiles'}
     else:
         outputformat_shortcuts = \
             {'code': 'org.apache.hadoop.mapred.SequenceFileOutputFormat',
              'text': 'org.apache.hadoop.mapred.TextOutputFormat',
              'raw': 'fm.last.feathers.output.RawFileOutputFormat',
              'sequencefile': 'org.apache.hadoop.mapred.SequenceFileOutputFormat'}
     outputformat_shortcuts.update(configopts('outputformats', self.prog))
     outputformat = addedopts['outputformat'][0]
     if outputformat.lower() in outputformat_shortcuts:
         outputformat = outputformat_shortcuts[outputformat.lower()]
     self.opts.append(('outputformat', outputformat))
     if addedopts['addpath'] and addedopts['addpath'][0] != 'no':
         self.opts.append(('cmdenv', 'dumbo_addpath=true'))
     pyenv = envdef('PYTHONPATH',
                    addedopts['libegg'],
                    'file',
                    self.opts,
                    shortcuts=dict(configopts('eggs', self.prog)),
                    quote=False,
                    trim=True,
                    extrapaths=addedopts['pypath'])
     if pyenv:
         self.opts.append(('cmdenv', pyenv))
     if addedopts['libjarstreaming'] and addedopts['libjarstreaming'][0] != 'no':
         addedopts['libjar'].append(streamingjar)
     hadenv = envdef('HADOOP_CLASSPATH', addedopts['libjar'], 'libjar',
                     self.opts, shortcuts=dict(configopts('jars', self.prog)))
     # Local (file://) entries go back on the command line as plain 'file'
     # options; everything else is handed over through 'tmpfiles'.
     fileopt = getopt(self.opts, 'file')
     if fileopt:
         tmpfiles = []
         for fname in fileopt:
             if fname.startswith('file://'):
                 self.opts.append(('file', fname[7:]))
             else:
                 tmpfiles.append(fname)
         if tmpfiles:
             self.opts.append(('jobconf', 'tmpfiles=' + ','.join(tmpfiles)))
     # Same split for jars, with the remainder going to 'tmpjars'.
     libjaropt = getopt(self.opts, 'libjar')
     if libjaropt:
         tmpjars = []
         for jar in libjaropt:
             if jar.startswith('file://'):
                 self.opts.append(('file', jar[7:]))
             else:
                 tmpjars.append(jar)
         if tmpjars:
             self.opts.append(('jobconf', 'tmpjars=' + ','.join(tmpjars)))
     cmd = hadoop + '/bin/hadoop jar ' + streamingjar
     retval = execute(cmd, self.opts, hadenv)
     if addedopts['delinputs'] and addedopts['delinputs'][0] == 'yes':
         # Use bin/hdfs when the install has it, bin/hadoop otherwise.
         if os.path.exists(hadoop + "/bin/hdfs"):
             hdfs = hadoop + "/bin/hdfs"
         else:
             hdfs = hadoop + "/bin/hadoop"
         for (key, value) in self.opts:
             if key == 'input':
                 execute("%s dfs -rmr '%s'" % (hdfs, value))
     return retval
Exemple #9
0
 def __init__(self, prog, opts):
     """Initialise the iteration and append the streaming config options.

     Options from the 'streaming' config section are added first, then
     those from the 'streaming_<hadoop>' section, where <hadoop> is the
     first value of the 'hadoop' option.
     """
     Iteration.__init__(self, prog, opts)
     self.opts += configopts('streaming', prog, self.opts)
     hadoopvalues = getopt(self.opts, 'hadoop', delete=False)
     section = 'streaming_' + hadoopvalues[0]
     self.opts += configopts(section, prog, self.opts)
Exemple #10
0
 def create_filesystem(self, opts):
     """Return a StreamingFileSystem for the first 'hadoop' option value.

     The value is resolved through findhadoop() before being handed to the
     file system; opts are left unmodified.
     """
     hadoopname = getopt(opts, 'hadoop', delete=False)[0]
     hadoopdir = findhadoop(hadoopname)
     return StreamingFileSystem(hadoopdir)
Exemple #11
0
 def cat(self, path, opts):
     """Delegate to _cat(), dumping as code when 'ascode' is 'yes', else as text."""
     ascodeopt = getopt(opts, 'ascode')
     wants_code = ascodeopt and ascodeopt[0] == 'yes'
     dumper = dumpcode if wants_code else dumptext
     return self._cat(path, opts, dumper, outputs=True)
Exemple #12
0
 def run(self):
     """Translate the dumbo options into a Hadoop streaming job and run it.

     The base Iteration is run first; a non-zero status from it is returned
     immediately. Otherwise the dumbo-level options are read with getopts()
     and re-expressed as the jobconf / cmdenv / file / format options put
     on the "hadoop jar <streamingjar>" command line, which is then run
     through execute().

     Returns 1 when the streaming jar or the "typedbytes" module cannot be
     found, otherwise the status returned by execute() for the job.
     Raises ValueError when a local 'file' option names a missing path.
     """
     retval = Iteration.run(self)
     if retval != 0:
         return retval
     if os.path.exists(self.prog):
         self.opts.append(('file', self.prog))
     addedopts = getopts(self.opts, ['hadoop',
                                     'name',
                                     'delinputs',
                                     'libegg',
                                     'libjar',
                                     'inputformat',
                                     'outputformat',
                                     'nummaptasks',
                                     'numreducetasks',
                                     'priority',
                                     'queue',
                                     'cachefile',
                                     'cachearchive',
                                     'file',
                                     'codewritable',
                                     'addpath',
                                     'getpath',
                                     'python',
                                     'streamoutput',
                                     'pypath'])
     hadoop = findhadoop(addedopts['hadoop'][0])
     streamingjar = findjar(hadoop, 'streaming')
     if not streamingjar:
         print >> sys.stderr, 'ERROR: Streaming jar not found'
         return 1
     try:
         import typedbytes
     except ImportError:
         print >> sys.stderr, 'ERROR: "typedbytes" module not found'
         return 1
     # A module loaded from inside an egg is shipped as the whole egg,
     # otherwise the module file itself is shipped.
     modpath = re.sub(r'\.egg.*$', '.egg', typedbytes.__file__)
     if modpath.endswith('.egg'):
         addedopts['libegg'].append(modpath)
     else:
         self.opts.append(('file', modpath))
     self.opts.append(('jobconf', 'stream.map.input=typedbytes'))
     self.opts.append(('jobconf', 'stream.reduce.input=typedbytes'))
     # 'streamoutput' overrides the format of the final stage only: the map
     # output for a map-only job, the reduce output otherwise.
     if addedopts['numreducetasks'] and addedopts['numreducetasks'][0] == '0':
         self.opts.append(('jobconf', 'stream.reduce.output=typedbytes'))
         if addedopts['streamoutput']:
             id_ = addedopts['streamoutput'][0]
             self.opts.append(('jobconf', 'stream.map.output=' + id_))
         else:
             self.opts.append(('jobconf', 'stream.map.output=typedbytes'))
     else:
         self.opts.append(('jobconf', 'stream.map.output=typedbytes'))
         if addedopts['streamoutput']:
             id_ = addedopts['streamoutput'][0]
             self.opts.append(('jobconf', 'stream.reduce.output=' + id_))
         else:
             self.opts.append(('jobconf', 'stream.reduce.output=typedbytes'))
     # The job name defaults to the program's file name.
     if not addedopts['name']:
         self.opts.append(('jobconf', 'mapred.job.name='
                           + self.prog.split('/')[-1]))
     else:
         self.opts.append(('jobconf', 'mapred.job.name=%s'
                           % addedopts['name'][0]))
     if addedopts['nummaptasks']:
         self.opts.append(('jobconf', 'mapred.map.tasks=%s'
                           % addedopts['nummaptasks'][0]))
     if addedopts['numreducetasks']:
         # int() validates the value before it reaches the command line
         numreducetasks = int(addedopts['numreducetasks'][0])
         self.opts.append(('numReduceTasks', str(numreducetasks)))
     if addedopts['priority']:
         self.opts.append(('jobconf', 'mapred.job.priority=%s'
                           % addedopts['priority'][0]))
     if addedopts['queue']:
         self.opts.append(('jobconf', 'mapred.job.queue.name=%s'
                           % addedopts['queue'][0]))
     if addedopts['cachefile']:
         for cachefile in addedopts['cachefile']:
             self.opts.append(('cacheFile', cachefile))
     if addedopts['cachearchive']:
         for cachearchive in addedopts['cachearchive']:
             self.opts.append(('cacheArchive', cachearchive))
     if addedopts['file']:
         for fname in addedopts['file']:
             # Entries without a scheme are local files: they must exist
             # and are tagged with file:// so they can be told apart below.
             if '://' not in fname:
                 if not os.path.exists(fname):
                     raise ValueError('file "' + fname + '" does not exist')
                 fname = 'file://' + os.path.abspath(fname)
             self.opts.append(('file', fname))
     if not addedopts['inputformat']:
         addedopts['inputformat'] = ['auto']
     inputformat_shortcuts = \
         {'code': 'org.apache.hadoop.streaming.AutoInputFormat',
          'text': 'org.apache.hadoop.mapred.TextInputFormat',
          'sequencefile': 'org.apache.hadoop.streaming.AutoInputFormat',
          'auto': 'org.apache.hadoop.streaming.AutoInputFormat'}
     inputformat_shortcuts.update(configopts('inputformats', self.prog))
     inputformat = addedopts['inputformat'][0]
     # Shortcut names are case-insensitive; anything else is passed through.
     if inputformat.lower() in inputformat_shortcuts:
         inputformat = inputformat_shortcuts[inputformat.lower()]
     self.opts.append(('inputformat', inputformat))
     if not addedopts['outputformat']:
         addedopts['outputformat'] = ['sequencefile']
     # Compare the option's value, not the list holding it: a list never
     # equals 'no', so the old test enabled the Multiple* formats even for
     # "-getpath no".
     if addedopts['getpath'] and addedopts['getpath'][0] != 'no':
         outputformat_shortcuts = \
             {'code': 'fm.last.feathers.output.MultipleSequenceFiles',
              'text': 'fm.last.feathers.output.MultipleTextFiles',
              'raw': 'fm.last.feathers.output.MultipleRawFileOutputFormat',
              'sequencefile': 'fm.last.feathers.output.MultipleSequenceFiles'}
     else:
         outputformat_shortcuts = \
             {'code': 'org.apache.hadoop.mapred.SequenceFileOutputFormat',
              'text': 'org.apache.hadoop.mapred.TextOutputFormat',
              'raw': 'fm.last.feathers.output.RawFileOutputFormat',
              'sequencefile': 'org.apache.hadoop.mapred.SequenceFileOutputFormat'}
     outputformat_shortcuts.update(configopts('outputformats', self.prog))
     outputformat = addedopts['outputformat'][0]
     if outputformat.lower() in outputformat_shortcuts:
         outputformat = outputformat_shortcuts[outputformat.lower()]
     self.opts.append(('outputformat', outputformat))
     if addedopts['addpath'] and addedopts['addpath'][0] != 'no':
         self.opts.append(('cmdenv', 'dumbo_addpath=true'))
     pyenv = envdef('PYTHONPATH',
                    addedopts['libegg'],
                    'file',
                    self.opts,
                    shortcuts=dict(configopts('eggs', self.prog)),
                    quote=False,
                    trim=True,
                    extrapaths=addedopts['pypath'])
     if pyenv:
         self.opts.append(('cmdenv', pyenv))
     hadenv = envdef('HADOOP_CLASSPATH', addedopts['libjar'], 'libjar',
                     self.opts, shortcuts=dict(configopts('jars', self.prog)))
     # Local (file://) entries go back on the command line as plain 'file'
     # options; everything else is handed over through 'tmpfiles'.
     fileopt = getopt(self.opts, 'file')
     if fileopt:
         tmpfiles = []
         for fname in fileopt:
             if fname.startswith('file://'):
                 self.opts.append(('file', fname[7:]))
             else:
                 tmpfiles.append(fname)
         if tmpfiles:
             self.opts.append(('jobconf', 'tmpfiles=' + ','.join(tmpfiles)))
     # Same split for jars, with the remainder going to 'tmpjars'.
     libjaropt = getopt(self.opts, 'libjar')
     if libjaropt:
         tmpjars = []
         for jar in libjaropt:
             if jar.startswith('file://'):
                 self.opts.append(('file', jar[7:]))
             else:
                 tmpjars.append(jar)
         if tmpjars:
             self.opts.append(('jobconf', 'tmpjars=' + ','.join(tmpjars)))
     cmd = hadoop + '/bin/hadoop jar ' + streamingjar
     retval = execute(cmd, self.opts, hadenv)
     if addedopts['delinputs'] and addedopts['delinputs'][0] == 'yes':
         # Use bin/hdfs when the install has it, bin/hadoop otherwise.
         if os.path.exists(hadoop + "/bin/hdfs"):
             hdfs = hadoop + "/bin/hdfs"
         else:
             hdfs = hadoop + "/bin/hadoop"
         for (key, value) in self.opts:
             if key == 'input':
                 execute("%s dfs -rmr '%s'" % (hdfs, value))
     return retval
Exemple #13
0
 def __init__(self, prog, opts):
     """Initialise the iteration and append the streaming config options.

     Options from the "streaming" config section are added first, then
     those from the "streaming_<hadoop>" section, where <hadoop> is the
     first value of the "hadoop" option.
     """
     Iteration.__init__(self, prog, opts)
     self.opts += configopts("streaming", prog, self.opts)
     hadoopvalues = getopt(self.opts, "hadoop", delete=False)
     section = "streaming_" + hadoopvalues[0]
     self.opts += configopts(section, prog, self.opts)
Exemple #14
0
 def matches(self, opts):
     """Tell whether opts contain a 'punix' option (opts are not modified)."""
     punixopt = getopt(opts, 'punix', delete=False)
     return True if punixopt else False
Exemple #15
0
def main(module=None):
    """Build the flow described by module.init() and run it.

    When the first command-line argument is not an option flag the process
    is treated as running inside a task: the flow paths come from the
    FLOW_INPUTS / FLOW_OUTPUTS environment variables and a single task is
    run (its index taken from the second argument, default 0). Otherwise
    the paths come from the 'input' / 'output' options and the whole flow
    is run, sequentially when the 'seq' option is 'yes'.

    module defaults to __main__ and must provide init(flow, *positional,
    **named), returning one ResultSet or a sequence of them. Exits with
    status 1 on invalid paths or non-singleton final outputs.
    """
    if module is None:
        import __main__
        module = __main__

    intask = len(sys.argv) > 1 and sys.argv[1][0] != '-'
    opts = parseargs(sys.argv[1:])

    # Only meaningful when launching; defined up front so that the name is
    # bound on every path.
    sequential = False

    if intask:
        input_paths = os.environ['FLOW_INPUTS'].split(';')
        output_paths = os.environ['FLOW_OUTPUTS'].split(';')
    else:
        sequential = 'yes' in getopt(opts, 'seq')

        input_paths = getopt(opts, 'input')
        output_paths = getopt(opts, 'output')

        # ';' is the separator used to pass the paths on to the tasks.
        if any(';' in path for path in input_paths):
            print >> sys.stderr, "ERROR: Input paths cannot contain semi-colons"
            sys.exit(1)
        if any(';' in path for path in output_paths):
            print >> sys.stderr, "ERROR: Output paths cannot contain semi-colons"
            sys.exit(1)

    if any(',' in path for path in output_paths):
        print >> sys.stderr, "ERROR: Output paths cannot contain commas"
        sys.exit(1)

    print >> sys.stderr, "INFO: Flow inputs: %s" % input_paths
    print >> sys.stderr, "INFO: Flow outputs: %s" % output_paths

    flow = Flow(opts, input_paths, output_paths)

    # call special init function to initialize the flow
    positional_inputs = []
    named_inputs = {}

    # "name=value" inputs are passed to init() by keyword, the others by
    # position.
    for path_string in input_paths:
        if '=' in path_string:
            name, value = path_string.split('=', 1)
            named_inputs[name] = ResultSet.from_string(value)
        else:
            positional_inputs.append(ResultSet.from_string(path_string))

    outputs = module.init(flow, *positional_inputs, **named_inputs)

    # A lone ResultSet (or subclass instance) is treated as a one-item list.
    if isinstance(outputs, ResultSet):
        outputs = [outputs]

    for resultset, path in zip(outputs, output_paths):
        # Reject empty resultsets too: they are not singletons and indexing
        # them below would fail with an IndexError instead of this message.
        if len(resultset) != 1:
            print >> sys.stderr, "ERROR: Final outputs must be singleton resultsets"
            sys.exit(1)
        output = resultset[0]
        output.path = path
        output.temporary = False

    if intask:
        iterarg = 0
        if len(sys.argv) > 2:
            iterarg = int(sys.argv[2])

        flow.run_task(iterarg)
    else:
        if sequential:
            flow.run_all_sequential()
        else:
            flow.run_all()
 def create_iteration(self, opts):
     """Return a UnixIteration for the first 'prog' option value.

     The 'prog' option is fetched with getopt()'s default arguments before
     opts are handed to the iteration.
     """
     prog = getopt(opts, 'prog')[0]
     return UnixIteration(prog, opts)
Exemple #17
0
 def create_iteration(self, opts):
     """Return a StreamingIteration for the first "prog" option value.

     The "prog" option is fetched with getopt()'s default arguments before
     opts are handed to the iteration.
     """
     prog = getopt(opts, "prog")[0]
     return StreamingIteration(prog, opts)
Exemple #18
0
 def matches(self, opts):
     """Tell whether opts contain a 'hadoop' option (opts are not modified)."""
     hadoopopt = getopt(opts, 'hadoop', delete=False)
     return True if hadoopopt else False
Exemple #19
0
 def create_filesystem(self, opts):
     """Return a StreamingFileSystem for the first "hadoop" option value.

     The value is resolved through findhadoop() before being handed to the
     file system; opts are left unmodified.
     """
     hadoopname = getopt(opts, "hadoop", delete=False)[0]
     hadoopdir = findhadoop(hadoopname)
     return StreamingFileSystem(hadoopdir)
Exemple #20
0
 def create_iteration(self, opts):
     """Return a UnixIteration for the first 'prog' option value.

     The 'prog' option is fetched with getopt()'s default arguments before
     opts are handed to the iteration.
     """
     prog = getopt(opts, 'prog')[0]
     return UnixIteration(prog, opts)
Exemple #21
0
 def run(self):
     """Translate the dumbo options into a Hadoop streaming job and run it.

     The base Iteration is run first; a non-zero status from it is returned
     immediately. Otherwise the dumbo-level options are read with getopts()
     and re-expressed as the jobconf / cmdenv / file / format options put
     on the "hadoop jar <streamingjar>" command line, which is then run
     through execute().

     Returns 1 when the streaming jar or the "typedbytes" module cannot be
     found, otherwise the status returned by execute() for the job.
     Raises ValueError when a local "file" option names a missing path.
     """
     retval = Iteration.run(self)
     if retval != 0:
         return retval
     if os.path.exists(self.prog):
         self.opts.append(("file", self.prog))
     addedopts = getopts(
         self.opts,
         [
             "hadoop",
             "name",
             "delinputs",
             "libegg",
             "libjar",
             "inputformat",
             "outputformat",
             "nummaptasks",
             "numreducetasks",
             "priority",
             "queue",
             "cachefile",
             "cachearchive",
             "file",
             "codewritable",
             "addpath",
             "getpath",
             "python",
             "streamoutput",
             "pypath",
         ],
     )
     hadoop = findhadoop(addedopts["hadoop"][0])
     streamingjar = findjar(hadoop, "streaming")
     if not streamingjar:
         print >> sys.stderr, "ERROR: Streaming jar not found"
         return 1
     try:
         import typedbytes
     except ImportError:
         print >> sys.stderr, 'ERROR: "typedbytes" module not found'
         return 1
     # A module loaded from inside an egg is shipped as the whole egg,
     # otherwise the module file itself is shipped.
     modpath = re.sub(r"\.egg.*$", ".egg", typedbytes.__file__)
     if modpath.endswith(".egg"):
         addedopts["libegg"].append(modpath)
     else:
         self.opts.append(("file", modpath))
     self.opts.append(("jobconf", "stream.map.input=typedbytes"))
     self.opts.append(("jobconf", "stream.reduce.input=typedbytes"))
     # "streamoutput" overrides the format of the final stage only: the map
     # output for a map-only job, the reduce output otherwise.
     if addedopts["numreducetasks"] and addedopts["numreducetasks"][0] == "0":
         self.opts.append(("jobconf", "stream.reduce.output=typedbytes"))
         if addedopts["streamoutput"]:
             id_ = addedopts["streamoutput"][0]
             self.opts.append(("jobconf", "stream.map.output=" + id_))
         else:
             self.opts.append(("jobconf", "stream.map.output=typedbytes"))
     else:
         self.opts.append(("jobconf", "stream.map.output=typedbytes"))
         if addedopts["streamoutput"]:
             id_ = addedopts["streamoutput"][0]
             self.opts.append(("jobconf", "stream.reduce.output=" + id_))
         else:
             self.opts.append(("jobconf", "stream.reduce.output=typedbytes"))
     # The job name defaults to the program's file name.
     if not addedopts["name"]:
         self.opts.append(("jobconf", "mapred.job.name=" + self.prog.split("/")[-1]))
     else:
         self.opts.append(("jobconf", "mapred.job.name=%s" % addedopts["name"][0]))
     if addedopts["nummaptasks"]:
         self.opts.append(("jobconf", "mapred.map.tasks=%s" % addedopts["nummaptasks"][0]))
     if addedopts["numreducetasks"]:
         # int() validates the value before it reaches the command line
         numreducetasks = int(addedopts["numreducetasks"][0])
         self.opts.append(("numReduceTasks", str(numreducetasks)))
     if addedopts["priority"]:
         self.opts.append(("jobconf", "mapred.job.priority=%s" % addedopts["priority"][0]))
     if addedopts["queue"]:
         self.opts.append(("jobconf", "mapred.job.queue.name=%s" % addedopts["queue"][0]))
     if addedopts["cachefile"]:
         for cachefile in addedopts["cachefile"]:
             self.opts.append(("cacheFile", cachefile))
     if addedopts["cachearchive"]:
         for cachearchive in addedopts["cachearchive"]:
             self.opts.append(("cacheArchive", cachearchive))
     if addedopts["file"]:
         for fname in addedopts["file"]:
             # Entries without a scheme are local files: they must exist
             # and are tagged with file:// so they can be told apart below.
             if "://" not in fname:
                 if not os.path.exists(fname):
                     raise ValueError('file "' + fname + '" does not exist')
                 fname = "file://" + os.path.abspath(fname)
             self.opts.append(("file", fname))
     if not addedopts["inputformat"]:
         addedopts["inputformat"] = ["auto"]
     inputformat_shortcuts = {
         "code": "org.apache.hadoop.streaming.AutoInputFormat",
         "text": "org.apache.hadoop.mapred.TextInputFormat",
         "sequencefile": "org.apache.hadoop.streaming.AutoInputFormat",
         "auto": "org.apache.hadoop.streaming.AutoInputFormat",
     }
     inputformat_shortcuts.update(configopts("inputformats", self.prog))
     inputformat = addedopts["inputformat"][0]
     # Shortcut names are case-insensitive; anything else is passed through.
     if inputformat.lower() in inputformat_shortcuts:
         inputformat = inputformat_shortcuts[inputformat.lower()]
     self.opts.append(("inputformat", inputformat))
     if not addedopts["outputformat"]:
         addedopts["outputformat"] = ["sequencefile"]
     # Compare the option's value, not the list holding it: a list never
     # equals "no", so the old test enabled the Multiple* formats even for
     # "-getpath no".
     if addedopts["getpath"] and addedopts["getpath"][0] != "no":
         outputformat_shortcuts = {
             "code": "fm.last.feathers.output.MultipleSequenceFiles",
             "text": "fm.last.feathers.output.MultipleTextFiles",
             "raw": "fm.last.feathers.output.MultipleRawFileOutputFormat",
             "sequencefile": "fm.last.feathers.output.MultipleSequenceFiles",
         }
     else:
         outputformat_shortcuts = {
             "code": "org.apache.hadoop.mapred.SequenceFileOutputFormat",
             "text": "org.apache.hadoop.mapred.TextOutputFormat",
             "raw": "fm.last.feathers.output.RawFileOutputFormat",
             "sequencefile": "org.apache.hadoop.mapred.SequenceFileOutputFormat",
         }
     outputformat_shortcuts.update(configopts("outputformats", self.prog))
     outputformat = addedopts["outputformat"][0]
     if outputformat.lower() in outputformat_shortcuts:
         outputformat = outputformat_shortcuts[outputformat.lower()]
     self.opts.append(("outputformat", outputformat))
     if addedopts["addpath"] and addedopts["addpath"][0] != "no":
         self.opts.append(("cmdenv", "dumbo_addpath=true"))
     pyenv = envdef(
         "PYTHONPATH",
         addedopts["libegg"],
         "file",
         self.opts,
         shortcuts=dict(configopts("eggs", self.prog)),
         quote=False,
         trim=True,
         extrapaths=addedopts["pypath"],
     )
     if pyenv:
         self.opts.append(("cmdenv", pyenv))
     hadenv = envdef(
         "HADOOP_CLASSPATH", addedopts["libjar"], "libjar", self.opts, shortcuts=dict(configopts("jars", self.prog))
     )
     # Local (file://) entries go back on the command line as plain "file"
     # options; everything else is handed over through "tmpfiles".
     fileopt = getopt(self.opts, "file")
     if fileopt:
         tmpfiles = []
         for fname in fileopt:
             if fname.startswith("file://"):
                 self.opts.append(("file", fname[7:]))
             else:
                 tmpfiles.append(fname)
         if tmpfiles:
             self.opts.append(("jobconf", "tmpfiles=" + ",".join(tmpfiles)))
     # Same split for jars, with the remainder going to "tmpjars".
     libjaropt = getopt(self.opts, "libjar")
     if libjaropt:
         tmpjars = []
         for jar in libjaropt:
             if jar.startswith("file://"):
                 self.opts.append(("file", jar[7:]))
             else:
                 tmpjars.append(jar)
         if tmpjars:
             self.opts.append(("jobconf", "tmpjars=" + ",".join(tmpjars)))
     cmd = hadoop + "/bin/hadoop jar " + streamingjar
     retval = execute(cmd, self.opts, hadenv)
     if addedopts["delinputs"] and addedopts["delinputs"][0] == "yes":
         # Use bin/hdfs when the install has it, bin/hadoop otherwise.
         if os.path.exists(hadoop + "/bin/hdfs"):
             hdfs = hadoop + "/bin/hdfs"
         else:
             hdfs = hadoop + "/bin/hadoop"
         for (key, value) in self.opts:
             if key == "input":
                 execute("%s dfs -rmr '%s'" % (hdfs, value))
     return retval
Exemple #22
0
 def __init__(self, prog, opts):
     """Initialise the iteration and append the streaming config options.

     Options from the 'streaming' config section are added first, then
     those from the 'streaming_<hadoop>' section, where <hadoop> is the
     first value of the 'hadoop' option.
     """
     Iteration.__init__(self, prog, opts)
     self.opts += configopts('streaming', prog, self.opts)
     hadoopvalues = getopt(self.opts, 'hadoop', delete=False)
     section = 'streaming_' + hadoopvalues[0]
     self.opts += configopts(section, prog, self.opts)